diff --git a/.bazelrc b/.bazelrc index 2b80063fd59..d06e0836184 100644 --- a/.bazelrc +++ b/.bazelrc @@ -37,7 +37,6 @@ # v2: Build TF v2 # # Feature and Third party library support options: -# xla: Build TF with XLA # using_cuda: CUDA is available to build system. # cuda: Build with full cuda support. # rocm: Build with AMD GPU support (rocm). @@ -227,6 +226,14 @@ build --noincompatible_remove_legacy_whole_archive # https://github.com/tensorflow/community/pull/179 build --noincompatible_prohibit_aapt1 +# Enable XLA +build --action_env=TF_ENABLE_XLA=1 +build --define=with_xla_support=true + +# Keep config XLA until all build scripts are cleaned up. +build:xla --action_env=TF_ENABLE_XLA=1 +build:xla --define=with_xla_support=true + # Modular TF build options build:dynamic_kernels --define=dynamic_loaded_kernels=true build:dynamic_kernels --copt=-DAUTOLOAD_DYNAMIC_KERNELS @@ -312,10 +319,6 @@ build:v2 --action_env=TF2_BEHAVIOR=1 build --config=v2 test --config=v2 -# Enable XLA -build:xla --action_env=TF_ENABLE_XLA=1 -build:xla --define=with_xla_support=true - # BEGIN TF REMOTE BUILD EXECUTION OPTIONS # Options when using remote execution # WARNING: THESE OPTIONS WONT WORK IF YOU DO NOT HAVE PROPER AUTHENTICATION AND PERMISSIONS @@ -348,7 +351,6 @@ build:rbe_linux --host_java_toolchain=@bazel_tools//tools/jdk:toolchain_hostjdk8 build:rbe_linux --java_toolchain=@bazel_tools//tools/jdk:toolchain_hostjdk8 # Non-rbe settings we should include because we do not run configure -build:rbe_linux --config=xla build:rbe_linux --config=avx_linux build:rbe_linux --config=short_logs # TODO(gunan): Check why we need this specified in rbe, but not in other builds. @@ -386,9 +388,8 @@ build:rbe_linux_py2 --python_path="/usr/bin/python2" build:rbe_linux_py2 --repo_env=TF_PYTHON_CONFIG_REPO="@org_tensorflow//third_party/toolchains/preconfig/ubuntu16.04/py" build:rbe_linux_py3 --config=rbe_linux -build:rbe_linux_py3 --repo_env=PYTHON_BIN_PATH="/usr/bin/python3" build:rbe_linux_py3 --python_path="/usr/bin/python3" -build:rbe_linux_py3 --repo_env=TF_PYTHON_CONFIG_REPO="@org_tensorflow//third_party/toolchains/preconfig/ubuntu16.04/py3" +build:rbe_linux_py3 --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu16.04-manylinux2010-py3_config_python" build:rbe_win --config=rbe build:rbe_win --crosstool_top="@org_tensorflow//third_party/toolchains/preconfig/win_1803/bazel_121:toolchain" @@ -405,9 +406,7 @@ build:rbe_win --define=override_eigen_strong_inline=true build:rbe_win --jobs=500 build:rbe_win_py37 --config=rbe -build:rbe_win_py37 --repo_env=PYTHON_BIN_PATH=C:\\Python37\\python.exe -build:rbe_win_py37 --repo_env=PYTHON_LIB_PATH=C:\\Python37\\lib\\site-packages -build:rbe_win_py37 --repo_env=TF_PYTHON_CONFIG_REPO=@org_tensorflow//third_party/toolchains/preconfig/win_1803/py37 +build:rbe_win_py37 --repo_env=TF_PYTHON_CONFIG_REPO="@windows_py37_config_python" build:rbe_win_py37 --python_path=C:\\Python37\\python.exe build:rbe_win_py38 --config=rbe diff --git a/.gitignore b/.gitignore index 99ba9312a92..eab8a64c63d 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,7 @@ tensorflow/contrib/cmake/_build/ /tensorflow/python/framework/fast_tensor_util.cpp /tensorflow/lite/gen/** /tensorflow/lite/tools/make/downloads/** +/tensorflow/lite/tools/make/gen/** /api_init_files_list.txt /estimator_api_init_files_list.txt *.whl diff --git a/README.md b/README.md index 56baa0740c3..e95fea22c56 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ $ python 3 >>> hello = tf.constant('Hello, TensorFlow!') >>> hello.numpy() -'Hello, 
TensorFlow!' +b'Hello, TensorFlow!' ``` For more examples, see the diff --git a/configure.py b/configure.py index ed09a693fd4..7f1a72886f2 100644 --- a/configure.py +++ b/configure.py @@ -1390,10 +1390,6 @@ def main(): else: environ_cp['TF_CONFIGURE_IOS'] = '0' - xla_enabled_by_default = is_linux() or is_macos() - set_build_var(environ_cp, 'TF_ENABLE_XLA', 'XLA JIT', 'with_xla_support', - xla_enabled_by_default, 'xla') - set_action_env_var( environ_cp, 'TF_NEED_OPENCL_SYCL', diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index 3a6c2eef1fe..30ae001caf7 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -205,6 +205,7 @@ tf_cuda_cc_test( "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", + "//tensorflow/core/platform:casts", "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index b6a87cc616d..b045ed5b701 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -874,12 +874,12 @@ TF_CAPI_EXPORT extern bool TFE_ContextCheckAlive(TFE_Context* ctx, #endif // !IS_MOBILE_PLATFORM } -TF_CAPI_EXPORT extern void TFE_ContextClearRemoteExecutors(TFE_Context* ctx, - TF_Status* status) { +TF_CAPI_EXPORT extern void TFE_ContextAsyncWait(TFE_Context* ctx, + TF_Status* status) { #if defined(IS_MOBILE_PLATFORM) status->status = tensorflow::Status::OK(); #else // !defined(IS_MOBILE_PLATFORM) - status->status = ctx->context->ClearRemoteExecutors(); + status->status = ctx->context->SyncExecutors(); #endif // !IS_MOBILE_PLATFORM } @@ -1450,6 +1450,25 @@ void TFE_OpSetAttrFunctionList(TFE_Op* op, const char* attr_name, } } +void TFE_OpSetAttrValueProto(const TFE_Op* op, const char* attr_name, + const void* proto, size_t proto_len, + TF_Status* status) { + tensorflow::AttrValue attr_value; + if (!attr_value.ParseFromArray(proto, proto_len)) { + status->status = + tensorflow::errors::InvalidArgument("Unparseable AttrValue proto"); + return; + } + if (op == nullptr || op->operation == nullptr) { + status->status = tensorflow::errors::InvalidArgument( + "Got a null or uninitialized `op` argument"); + return; + } + auto operation = tensorflow::down_cast( + op->operation.get()); + operation->MutableAttrs()->Set(attr_name, attr_value); +} + TF_CAPI_EXPORT extern int TFE_OpGetInputLength(TFE_Op* op, const char* input_name, TF_Status* status) { @@ -1606,7 +1625,7 @@ void TFE_ContextEndStep(TFE_Context* ctx) { ctx->context->EndStep(); } void TFE_OpGetAttrs(TFE_Op* op, TFE_OpAttrs* attrs) { auto operation = tensorflow::down_cast( op->operation.get()); - *attrs = TFE_OpAttrs(&operation->Attrs()); + *attrs = TFE_OpAttrs(&operation->Attrs(), op->operation->Name().c_str()); } void TFE_OpAddAttrs(TFE_Op* op, const TFE_OpAttrs* attrs) { @@ -1620,6 +1639,14 @@ void TFE_OpAddAttrs(TFE_Op* op, const TFE_OpAttrs* attrs) { } } +void TFE_OpAttrsSerialize(const TFE_OpAttrs* attrs, TF_Buffer* buf, + TF_Status* status) { + tensorflow::NameAttrList name_and_attrs; + attrs->attributes->FillAttrValueMap(name_and_attrs.mutable_attr()); + name_and_attrs.set_name(attrs->name); + status->status = MessageToBuffer(name_and_attrs, buf); +} + namespace tensorflow { void SetOpAttrValueScalar(TFE_Context* ctx, TFE_Op* op, const tensorflow::AttrValue& default_value, @@ -1740,7 +1767,7 @@ class CustomDeviceAPI : public tensorflow::CustomDevice { } std::vector outputs(*num_retvals); TF_Status status; - TFE_OpAttrs attributes(&op->Attrs()); + TFE_OpAttrs 
attributes(&op->Attrs(), op->Name().c_str()); device_.execute(inputs.size(), inputs.data(), op->Name().c_str(), &attributes, num_retvals, outputs.data(), &status, info_); if (status.status.ok()) { diff --git a/tensorflow/c/eager/c_api_experimental.h b/tensorflow/c/eager/c_api_experimental.h index 92dab6a36c6..b0f0da5acef 100644 --- a/tensorflow/c/eager/c_api_experimental.h +++ b/tensorflow/c/eager/c_api_experimental.h @@ -382,9 +382,11 @@ TF_CAPI_EXPORT extern bool TFE_ContextCheckAlive(TFE_Context* ctx, const char* worker_name, TF_Status* status); -// Clear pending streaming requests and error statuses on remote executors. -TF_CAPI_EXPORT extern void TFE_ContextClearRemoteExecutors(TFE_Context* ctx, - TF_Status* status); +// Sync pending nodes in local executors (including the context default executor +// and thread executors) and streaming requests to remote executors, and get the +// combined status. +TF_CAPI_EXPORT extern void TFE_ContextAsyncWait(TFE_Context* ctx, + TF_Status* status); // If the TensorHandle is copied to another device as part of an op execution, // the copy is destroyed after the op has executed. Enabling implicit mirroring @@ -441,6 +443,21 @@ TF_CAPI_EXPORT extern void TFE_OpGetAttrs(TFE_Op* op, TFE_OpAttrs* attrs); // Does not overwrite or update existing attributes, but adds new ones. TF_CAPI_EXPORT extern void TFE_OpAddAttrs(TFE_Op* op, const TFE_OpAttrs* attrs); +// Serialize `attrs` as a tensorflow::NameAttrList protocol buffer (into `buf`), +// containing the op name and a map of its attributes. +TF_CAPI_EXPORT extern void TFE_OpAttrsSerialize(const TFE_OpAttrs* attrs, + TF_Buffer* buf, + TF_Status* status); + +// Set an op's attribute from a serialized AttrValue protocol buffer. +// +// Analogous to TF_SetAttrValueProto for building graph operations. +TF_CAPI_EXPORT extern void TFE_OpSetAttrValueProto(const TFE_Op* op, + const char* attr_name, + const void* proto, + size_t proto_len, + TF_Status* status); + #define TFE_CUSTOM_DEVICE_VERSION 1 // Struct to be filled in diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h index 943890b6259..05b0a143025 100644 --- a/tensorflow/c/eager/c_api_internal.h +++ b/tensorflow/c/eager/c_api_internal.h @@ -236,12 +236,16 @@ struct TFE_Executor { tensorflow::EagerExecutor* unowned_executor; }; +// An equivalent of a tensorflow::NameAttrList protocol buffer, but used in ways +// that sometimes do not require serialization. struct TFE_OpAttrs { - explicit TFE_OpAttrs() : attributes(nullptr) {} + explicit TFE_OpAttrs() : name(nullptr), attributes(nullptr) {} - explicit TFE_OpAttrs(const tensorflow::AttrBuilder* value) - : attributes(value) {} + explicit TFE_OpAttrs(const tensorflow::AttrBuilder* value, + const char* op_name) + : name(op_name), attributes(value) {} + const char* name; const tensorflow::AttrBuilder* attributes; }; diff --git a/tensorflow/c/eager/c_api_remote_test.cc b/tensorflow/c/eager/c_api_remote_test.cc index 2f363a4f9a4..eb6b234e3df 100644 --- a/tensorflow/c/eager/c_api_remote_test.cc +++ b/tensorflow/c/eager/c_api_remote_test.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include "tensorflow/c/eager/c_api_internal.h" #include "tensorflow/c/eager/c_api_test_util.h" #include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h" +#include "tensorflow/core/platform/casts.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/protobuf/cluster.pb.h" @@ -127,7 +128,7 @@ void TestRemoteExecute(bool async) { TEST(CAPI, RemoteExecute) { TestRemoteExecute(false); } TEST(CAPI, RemoteExecuteAsync) { TestRemoteExecute(true); } -void TestRemoteExecuteSilentCopies(bool async) { +void TestRemoteExecuteSilentCopies(bool async, bool remote) { tensorflow::ServerDef server_def = GetServerDef(3); // This server def has the task index set to 0. @@ -166,10 +167,14 @@ void TestRemoteExecuteSilentCopies(bool async) { auto* h1_task2 = TFE_TensorHandleCopyToDevice(h1_task0, ctx, task2_name, status); ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_TensorHandleEnableImplicitMirroring(h1_task2, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); // Handles are on task0 (local), and task2, but op is on task1. TFE_Op* matmul = MatMulOp(ctx, h0_task0, h1_task2); - TFE_OpSetDevice(matmul, task1_name, status); + if (remote) { + TFE_OpSetDevice(matmul, task1_name, status); + } EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TFE_TensorHandle* retvals[1]; @@ -177,6 +182,17 @@ void TestRemoteExecuteSilentCopies(bool async) { TFE_Execute(matmul, &retvals[0], &num_retvals, status); EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + // TODO(gjn): Add support for waiting on async local mirrors + if (!async) { + auto remote_arg = tensorflow::down_cast( + h1_task2->handle.get()) + ->Handle(); + auto op = tensorflow::down_cast( + matmul->operation.get()); + // The input handles should never change since they have been mirrored. + ASSERT_EQ(op->GetInput(1), remote_arg); + } + auto* retval_task0 = TFE_TensorHandleCopyToDevice( retvals[0], ctx, "/job:localhost/replica:0/task:0/device:CPU:0", status); ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); @@ -213,9 +229,17 @@ void TestRemoteExecuteSilentCopies(bool async) { worker_server2.release(); } -TEST(CAPI, RemoteExecuteSilentCopies) { TestRemoteExecuteSilentCopies(false); } +TEST(CAPI, RemoteExecuteSilentCopies) { + TestRemoteExecuteSilentCopies(false, true); +} TEST(CAPI, RemoteExecuteSilentCopiesAsync) { - TestRemoteExecuteSilentCopies(true); + TestRemoteExecuteSilentCopies(true, true); +} +TEST(CAPI, RemoteExecuteSilentCopiesLocal) { + TestRemoteExecuteSilentCopies(false, false); +} +TEST(CAPI, RemoteExecuteSilentCopiesLocalAsync) { + TestRemoteExecuteSilentCopies(true, false); } void TestRemoteExecuteDeleteContextWithOutstandingRPC(bool async) { diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc index 04060b13885..67a2dde6c27 100644 --- a/tensorflow/c/eager/c_api_test.cc +++ b/tensorflow/c/eager/c_api_test.cc @@ -416,12 +416,23 @@ void TensorHandleSilentCopy(bool async, hgpu->handle.get()) ->Handle(); - // The input handles should never change since they have been mirrored. auto op = tensorflow::down_cast( matmul->operation.get()); - ASSERT_EQ(op->GetInput(0), arg0); - ASSERT_EQ(op->GetInput(1), arg1); - + if (!async) { + // The input handles should never change since they have been mirrored. 
+ ASSERT_EQ(op->GetInput(0), arg0); + ASSERT_EQ(op->GetInput(1), arg1); + } else { + if (cpu_op) { + ASSERT_EQ(op->GetInput(0), arg0); + // The GPU handle should be replaced with a CPU copy + ASSERT_NE(op->GetInput(1), arg1); + } else { + // The CPU handle should be replaced with a GPU copy + ASSERT_NE(op->GetInput(0), arg0); + ASSERT_EQ(op->GetInput(1), arg1); + } + } TFE_DeleteOp(matmul); TFE_DeleteTensorHandle(retvals[0]); TFE_DeleteTensorHandle(hgpu); @@ -1578,4 +1589,52 @@ TEST(CAPI, TestTFE_OpGetAttrs) { TFE_DeleteContext(ctx); } +TEST(CAPI, TestTFE_OpAttrsSerialize) { + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_Context* ctx = TFE_NewContext(opts, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + TFE_Op* var_op = TFE_NewOp(ctx, "VarHandleOp", status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpSetAttrType(var_op, "dtype", TF_INT64); + TFE_OpSetAttrShape(var_op, "shape", {}, 0, status); + TFE_OpAttrs attributes; + TFE_OpGetAttrs(var_op, &attributes); + + TF_Buffer* serialized_attr_values = TF_NewBuffer(); + TFE_OpAttrsSerialize(&attributes, serialized_attr_values, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + tensorflow::NameAttrList name_and_attrs; + ASSERT_TRUE(name_and_attrs.ParseFromArray(serialized_attr_values->data, + serialized_attr_values->length)); + ASSERT_EQ("VarHandleOp", name_and_attrs.name()); + ASSERT_EQ(tensorflow::DT_INT64, + name_and_attrs.attr().find("dtype")->second.type()); + TF_DeleteBuffer(serialized_attr_values); + + TFE_Op* second_var_op = TFE_NewOp(ctx, "VarHandleOp", status); + + string serialized_dtype; + ASSERT_TRUE(name_and_attrs.attr().find("dtype")->second.SerializeToString( + &serialized_dtype)); + TFE_OpSetAttrValueProto( + second_var_op, "dtype", + reinterpret_cast(serialized_dtype.c_str()), + serialized_dtype.length(), status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + tensorflow::AttrValueMap attr_values; + auto op = tensorflow::down_cast( + second_var_op->operation.get()); + op->Attrs().FillAttrValueMap(&attr_values); + EXPECT_EQ(tensorflow::DT_INT64, attr_values.find("dtype")->second.type()); + + TF_DeleteStatus(status); + TFE_DeleteOp(var_op); + TFE_DeleteOp(second_var_op); + TFE_DeleteContext(ctx); +} + } // namespace diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD index e680cc72b3b..882b4032f76 100644 --- a/tensorflow/cc/saved_model/BUILD +++ b/tensorflow/cc/saved_model/BUILD @@ -68,6 +68,7 @@ tf_cc_test( "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core:testlib", + "//tensorflow/core/platform:resource_loader", ], ) diff --git a/tensorflow/cc/saved_model/reader_test.cc b/tensorflow/cc/saved_model/reader_test.cc index e898664c221..bc630bcaede 100644 --- a/tensorflow/cc/saved_model/reader_test.cc +++ b/tensorflow/cc/saved_model/reader_test.cc @@ -21,15 +21,22 @@ limitations under the License. 
#include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/platform/path.h" +#include "tensorflow/core/platform/resource_loader.h" #include "tensorflow/core/platform/test.h" namespace tensorflow { namespace { -constexpr char kTestDataPbTxt[] = - "cc/saved_model/testdata/half_plus_two_pbtxt/00000123"; -constexpr char kTestDataSharded[] = - "cc/saved_model/testdata/half_plus_two/00000123"; +string TestDataPbTxt() { + return io::JoinPath("tensorflow", "cc", "saved_model", "testdata", + "half_plus_two_pbtxt", "00000123"); +} + +string TestDataSharded() { + return io::JoinPath("tensorflow", "cc", "saved_model", "testdata", + "half_plus_two", "00000123"); +} class ReaderTest : public ::testing::Test { protected: @@ -49,8 +56,7 @@ class ReaderTest : public ::testing::Test { TEST_F(ReaderTest, TagMatch) { MetaGraphDef meta_graph_def; - const string export_dir = - io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataSharded); + const string export_dir = GetDataDependencyFilepath(TestDataSharded()); TF_ASSERT_OK(ReadMetaGraphDefFromSavedModel(export_dir, {kSavedModelTagServe}, &meta_graph_def)); CheckMetaGraphDef(meta_graph_def); @@ -59,8 +65,7 @@ TEST_F(ReaderTest, TagMatch) { TEST_F(ReaderTest, NoTagMatch) { MetaGraphDef meta_graph_def; - const string export_dir = - io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataSharded); + const string export_dir = GetDataDependencyFilepath(TestDataSharded()); Status st = ReadMetaGraphDefFromSavedModel(export_dir, {"missing-tag"}, &meta_graph_def); EXPECT_FALSE(st.ok()); @@ -73,8 +78,7 @@ TEST_F(ReaderTest, NoTagMatch) { TEST_F(ReaderTest, NoTagMatchMultiple) { MetaGraphDef meta_graph_def; - const string export_dir = - io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataSharded); + const string export_dir = GetDataDependencyFilepath(TestDataSharded()); Status st = ReadMetaGraphDefFromSavedModel( export_dir, {kSavedModelTagServe, "missing-tag"}, &meta_graph_def); EXPECT_FALSE(st.ok()); @@ -87,8 +91,7 @@ TEST_F(ReaderTest, NoTagMatchMultiple) { TEST_F(ReaderTest, PbtxtFormat) { MetaGraphDef meta_graph_def; - const string export_dir = - io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataPbTxt); + const string export_dir = GetDataDependencyFilepath(TestDataPbTxt()); TF_ASSERT_OK(ReadMetaGraphDefFromSavedModel(export_dir, {kSavedModelTagServe}, &meta_graph_def)); CheckMetaGraphDef(meta_graph_def); @@ -97,8 +100,7 @@ TEST_F(ReaderTest, PbtxtFormat) { TEST_F(ReaderTest, InvalidExportPath) { MetaGraphDef meta_graph_def; - const string export_dir = - io::JoinPath(testing::TensorFlowSrcRoot(), "missing-path"); + const string export_dir = GetDataDependencyFilepath("missing-path"); Status st = ReadMetaGraphDefFromSavedModel(export_dir, {kSavedModelTagServe}, &meta_graph_def); EXPECT_FALSE(st.ok()); diff --git a/tensorflow/compiler/mlir/g3doc/README.md b/tensorflow/compiler/mlir/g3doc/README.md new file mode 100644 index 00000000000..39734828d19 --- /dev/null +++ b/tensorflow/compiler/mlir/g3doc/README.md @@ -0,0 +1,3 @@ +# TensorFlow MLIR + +These are the docs for: https://www.tensorflow.org/mlir diff --git a/tensorflow/compiler/mlir/g3doc/_book.yaml b/tensorflow/compiler/mlir/g3doc/_book.yaml new file mode 100644 index 00000000000..a75a2137536 --- /dev/null +++ b/tensorflow/compiler/mlir/g3doc/_book.yaml @@ -0,0 +1,26 @@ +upper_tabs: +# Tabs left of dropdown menu +- include: /_upper_tabs_left.yaml +- include: /api_docs/_upper_tabs_api.yaml +# Dropdown menu 
+- name: Resources + path: /resources + is_default: true + menu: + - include: /resources/_menu_toc.yaml + lower_tabs: + # Subsite tabs + other: + - name: Guide + contents: + - title: Overview + path: /mlir/overview + - heading: Dialects + - title: Overview + path: /mlir/dialects + - title: TensorFlow + path: /mlir/tf_ops + - title: TensorFlow Lite + path: /mlir/tfl_ops + +- include: /_upper_tabs_right.yaml diff --git a/tensorflow/compiler/mlir/g3doc/_index.yaml b/tensorflow/compiler/mlir/g3doc/_index.yaml new file mode 100644 index 00000000000..affd0926af5 --- /dev/null +++ b/tensorflow/compiler/mlir/g3doc/_index.yaml @@ -0,0 +1,54 @@ +book_path: /mlir/_book.yaml +project_path: /mlir/_project.yaml +description: +landing_page: + custom_css_path: /site-assets/css/style.css + rows: + - heading: MLIR unifies the infrastructure for high-performance ML models in TensorFlow. + items: + - description: > + The MLIR project defines a common + intermediate representation (IR) that unifies the infrastructure required to execute high + performance machine learning models in TensorFlow and similar ML frameworks. This project + will include the application of HPC techniques, along with integration of + search algorithms like reinforcement learning. MLIR aims to reduce the + cost to bring up new hardware, and improve usability for existing + TensorFlow users. + + - code_block: | +
+        <pre class="prettyprint">
+        // Syntactically similar to LLVM:
+        func @testFunction(%arg0: i32) {
+          %x = call @thingToCall(%arg0) : (i32) -> i32
+          br ^bb1
+        ^bb1:
+          %y = addi %x, %x : i32
+          return %y : i32
+        }
+        </pre>
+ + - classname: devsite-landing-row-cards + items: + - heading: "Multi-Level Intermediate Representation for Compiler Infrastructure" + youtube_id: qzljG6DKgic + buttons: + - label: Watch the video + path: https://www.youtube.com/watch?v=qzljG6DKgic + - heading: "A new intermediate representation and compiler framework" + image_path: /resources/images/tf-logo-card-16x9.png + path: https://blog.tensorflow.org/2019/04/mlir-new-intermediate-representation.html + buttons: + - label: Read on TensorFlow blog + path: https://blog.tensorflow.org/2019/04/mlir-new-intermediate-representation.html + - heading: MLIR on GitHub + image_path: /resources/images/github-card-16x9.png + path: https://github.com/llvm/llvm-project/tree/master/mlir + buttons: + - label: View on GitHub + path: https://github.com/llvm/llvm-project/tree/master/mlir + - heading: TensorFlow MLIR on GitHub + image_path: /resources/images/github-card-16x9.png + path: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/compiler/mlir + buttons: + - label: View on GitHub + path: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/compiler/mlir diff --git a/tensorflow/compiler/mlir/g3doc/dialects.md b/tensorflow/compiler/mlir/g3doc/dialects.md new file mode 100644 index 00000000000..fa6c4605b27 --- /dev/null +++ b/tensorflow/compiler/mlir/g3doc/dialects.md @@ -0,0 +1,37 @@ +# MLIR dialects + +## Overview + + +To separate different hardware and software targets, MLIR has “dialects”, +including: + +* TensorFlow IR, which represents all things possible in TensorFlow graphs. +* XLA HLO IR, which is designed to take advantage of XLA’s compilation + abilities (with output to, among other things, TPUs). +* An experimental affine dialect, which focuses on + [polyhedral representations](https://en.wikipedia.org/wiki/Polytope_model) + and optimizations. +* LLVM IR, which has a 1:1 mapping between it and LLVM’s own representation, + allowing MLIR to emit GPU and CPU code through LLVM. +* TensorFlow Lite, which will translate to running code on mobile platforms. + +Each dialect consists of a set of defined operations which have invariants +placed on them, like: “This is a binary operator, and the inputs and outputs +have the same types.” + +## Adding to MLIR + +MLIR has no fixed/built-in list of globally known operations (no “intrinsics”). +Dialects can define entirely custom types, which is how MLIR can model things +like the LLVM IR type system (which has first class aggregates), domain +abstractions important for ML-optimized accelerators like quantized types, and +even the Swift or Clang type systems (which are built around Swift/Clang +declaration nodes) in the future. + +If you want to connect a new low-level compiler, you would create a new dialect +and the lowerings between the TensorFlow Graph dialect and your dialect. +This smooths the path for hardware and compiler makers. You can even target +dialects at different levels in the same model; the higher-level optimizers +will respect the unfamiliar parts of the IR and wait for a lower level to handle +it. 
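To make the "create a new dialect" step described in dialects.md above concrete, here is a minimal C++ sketch of declaring and registering a custom dialect, written against the MLIR APIs vendored at this revision; the `myhw` dialect name and the commented-out op are hypothetical:

```c++
#include "mlir/IR/Dialect.h"      // TF:llvm-project
#include "mlir/IR/MLIRContext.h"  // TF:llvm-project

// Hypothetical dialect for a custom backend. Once registered, ops spelled
// "myhw.*" become parseable, and passes can lower TensorFlow-dialect ops
// into them.
class MyHWDialect : public mlir::Dialect {
 public:
  explicit MyHWDialect(mlir::MLIRContext* context)
      : mlir::Dialect(/*name=*/"myhw", context) {
    // Ops, usually generated from TableGen, would be registered here:
    // addOperations<MyHWMatMulOp>();
  }
};

// Global registration hook used by MLIR tools at this revision.
static mlir::DialectRegistration<MyHWDialect> myhw_registration;
```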
diff --git a/tensorflow/compiler/mlir/g3doc/images/mlir-infra.svg b/tensorflow/compiler/mlir/g3doc/images/mlir-infra.svg new file mode 100644 index 00000000000..aec0986ba02 --- /dev/null +++ b/tensorflow/compiler/mlir/g3doc/images/mlir-infra.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/tensorflow/compiler/mlir/g3doc/overview.md b/tensorflow/compiler/mlir/g3doc/overview.md new file mode 100644 index 00000000000..4cf99ba3800 --- /dev/null +++ b/tensorflow/compiler/mlir/g3doc/overview.md @@ -0,0 +1,36 @@ +# MLIR + +## Overview + +MLIR, or Multi-Level Intermediate Representation, is a representation format +and library of compiler utilities that sits between the model representation +and low-level compilers/executors that generate hardware-specific code. + +MLIR is, at its heart, a flexible infrastructure for modern optimizing +compilers. This means it consists of a specification for intermediate +representations (IR) and a code toolkit to perform transformations on that +representation. (In compiler parlance, as you move from higher-level +representations to lower-level representations, these transformations can be +called “lowerings”) + +MLIR is highly influenced by [LLVM](https://llvm.org/) and unabashedly reuses +many great ideas from it. It has a flexible type system, and allows +representing, analyzing and transforming graphs combining multiple levels of +abstraction in the same compilation unit. These abstractions include TensorFlow +operations, nested polyhedral loop regions, and even LLVM instructions and fixed +hardware operations and types. + +We expect MLIR to be of interest to many groups, including: + +* Compiler researchers and implementers looking to optimize performance and + memory consumption of machine learning models +* Hardware makers looking for a way to connect their hardware to TensorFlow, + such as TPUs, portable neural hardware in phones, and other custom ASICs +* People writing language bindings that want to take advantage of optimizing + compilers and hardware acceleration. + +The TensorFlow ecosystem contains a number of compilers and optimizers that +operate at multiple levels of the software and hardware stack. We expect the +gradual adoption of MLIR to simplify every aspect of this stack. + +MLIR overview diagram diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD index 8d51dd3cfc2..5af7ff2c207 100644 --- a/tensorflow/compiler/mlir/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/BUILD @@ -602,6 +602,7 @@ tf_cc_binary( name = "flatbuffer_translate", deps = [ ":flatbuffer_translate_lib", + "@llvm-project//mlir:LoopOpsTransforms", "@llvm-project//mlir:MlirTranslateMain", ], ) diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc index 73c21ea8ad0..6753ab9e728 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc @@ -46,7 +46,7 @@ limitations under the License. #include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/QuantOps/QuantOps.h" // TF:llvm-project #include "mlir/Dialect/QuantOps/QuantTypes.h" // TF:llvm-project -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project #include "mlir/IR/Diagnostics.h" // TF:llvm-project @@ -76,6 +76,7 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor_shape.pb.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/lite/model.h" #include "tensorflow/lite/schema/schema_generated.h" @@ -124,6 +125,20 @@ static opt experimental_prune_unreachable_nodes_unconditionally_flg( llvm::cl::location(experimental_prune_unreachable_nodes_unconditionally), llvm::cl::init(false)); +// NOLINTNEXTLINE +static opt input_arrays_flag( + "input-arrays", + llvm::cl::desc( + "List of input tensors, if different from the default inputs"), + llvm::cl::init("")); + +// NOLINTNEXTLINE +static opt output_arrays_flag( + "output-arrays", + llvm::cl::desc( + "List of output tensors, if different from the default outputs"), + llvm::cl::init("")); + namespace { bool IsScalar(const TensorT& tensor) { // TODO(b/138222071) We can't distinguish scalars and unranked tensors @@ -590,6 +605,11 @@ StatusOr ConvertOp( op_state.addTypes({type}); } + if (op_name == "tfl.lstm") { + // TODO(b/147587779): add the right region if region is empty. + op_state.addRegion(); + } + llvm::SmallVector attrs; if (IsCustomOp(op_name)) { auto status = mlir::CustomOptionsToAttributes(op_name, op.custom_options, @@ -610,43 +630,30 @@ StatusOr ConvertOp( return builder.createOperation(op_state); } -// Returns the output tensor indices for the given subgraph. If -// ordered_output_arrays is provided, then return the tensor indices in -// ordered_output_arrays. -StatusOr> GetOutputTensorIndices( - const tflite::SubGraphT& subgraph, Location base_loc, - const std::vector& ordered_output_arrays) { - if (ordered_output_arrays.empty()) { - return llvm::SmallVector(subgraph.outputs.begin(), - subgraph.outputs.end()); +// Returns indices of the given tensors in the subgraph. Returns error if a +// tensor name cannot be found in the subgraph. 
+StatusOr> GetTensorIndices( + const tflite::SubGraphT& subgraph, + const std::vector& tensor_names) { + absl::flat_hash_map name_to_index; + for (auto index_and_tensor : llvm::enumerate(subgraph.tensors)) { + name_to_index[index_and_tensor.value()->name] = index_and_tensor.index(); } - llvm::SmallVector outputs; - outputs.resize(ordered_output_arrays.size()); - absl::flat_hash_map output_order_map; - for (auto output : llvm::enumerate(ordered_output_arrays)) { - output_order_map[output.value()] = output.index(); - } + std::vector indices; + indices.reserve(tensor_names.size()); - int tensor_index = 0; - int found_output_tensors = 0; - for (const auto& tensor : subgraph.tensors) { - auto found = output_order_map.find(tensor->name); - if (found != output_order_map.end()) { - const int output_index = found->second; - outputs[output_index] = tensor_index; - ++found_output_tensors; + for (const auto& name : tensor_names) { + auto found = name_to_index.find(name); + if (found != name_to_index.end()) { + indices.push_back(found->second); + } else { + return errors::InvalidArgument("could not find tensor in subgraph: ", + name); } - ++tensor_index; } - if (found_output_tensors != ordered_output_arrays.size()) { - auto err = errors::InvalidArgument( - "cannot find all nodes in ordered_output_arrays"); - return emitError(base_loc, err.ToString()), err; - } - - return outputs; + return indices; } // Given a list of tensor indices, returns a string of concatenated tensor names @@ -661,15 +668,18 @@ mlir::NamedAttribute BuildTFEntryFunctionAttribute( name, builder->getStringAttr(llvm::join(tensor_names, ","))); } -// Given a list of output indices, traverses the subgraph and returns the set of -// ops that are ancestors of the output tensors. +// Traverses the subgraph from output_indices to input_indices and returns the +// set of ops that are visited. StatusOr> PruneSubgraph( - const tflite::SubGraphT& subgraph, ArrayRef output_indices) { + const tflite::SubGraphT& subgraph, ArrayRef input_indices, + ArrayRef output_indices) { // Create a map from tensor index to defining op. absl::flat_hash_map defining_op; for (const auto& op : subgraph.operators) { for (int32_t output : op->outputs) { - defining_op[output] = op.get(); + if (!llvm::is_contained(input_indices, output)) { + defining_op[output] = op.get(); + } } } @@ -718,18 +728,40 @@ StatusOr ConvertSubgraph( const std::vector& op_names, const std::vector& func_names, const std::vector>& buffers, - Location base_loc, Builder builder, - const std::vector& ordered_output_arrays, bool is_entry_point, + Location base_loc, Builder builder, bool is_entry_point, bool use_external_constant, + const std::vector& ordered_input_arrays, + const std::vector& ordered_output_arrays, bool experimental_prune_unreachable_nodes_unconditionally) { llvm::SmallVector ret_types; llvm::SmallVector input_types; auto func_loc = mlir::NameLoc::get(builder.getIdentifier(name), base_loc); - // Construct function type - for (auto input : subgraph.inputs) { - auto& tensor = *subgraph.tensors.at(input); + std::vector func_inputs = subgraph.inputs; + if (is_entry_point && !ordered_input_arrays.empty()) { + if (!experimental_prune_unreachable_nodes_unconditionally) { + // TODO(b/149922113): Resolve input-arrays/pruning flags interaction. + return errors::InvalidArgument( + "input-arrays should be used with experimental pruning flag"); + } + TF_ASSIGN_OR_RETURN(func_inputs, + GetTensorIndices(subgraph, ordered_input_arrays)); + } + + // Add state variables to inputs. 
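+  // A variable tensor (tensor.is_variable) that is not already a graph input
+  // is appended so its state is reachable through the function signature.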
+ absl::flat_hash_set input_index_set(func_inputs.begin(), + func_inputs.end()); + for (int i = 0; i < subgraph.tensors.size(); i++) { + auto& tensor = *subgraph.tensors.at(i); + if (tensor.is_variable && !input_index_set.contains(i)) { + func_inputs.emplace_back(i); + input_index_set.insert(i); + } + } + + for (auto input_or_variable : func_inputs) { + auto& tensor = *subgraph.tensors.at(input_or_variable); // TODO(b/138222071) Graph inputs must have static shape per the exporter, // but we cannot differentiate scalars from unranked tensors. // Here we reverse the default assumption that shape = [] means unranked. @@ -753,9 +785,11 @@ StatusOr ConvertSubgraph( } } - TF_ASSIGN_OR_RETURN( - auto func_outputs, - GetOutputTensorIndices(subgraph, base_loc, ordered_output_arrays)); + std::vector func_outputs = subgraph.outputs; + if (is_entry_point && !ordered_output_arrays.empty()) { + TF_ASSIGN_OR_RETURN(func_outputs, + GetTensorIndices(subgraph, ordered_output_arrays)); + } for (auto output : func_outputs) { bool is_constant = !is_op_output[output]; @@ -782,8 +816,8 @@ StatusOr ConvertSubgraph( Value maybe_optional_arg_marker = nullptr; // Get or construct MLIR values for each input - for (int i = 0, e = subgraph.inputs.size(); i < e; i++) { - auto input_tensor = subgraph.inputs[i]; + for (int i = 0, e = func_inputs.size(); i < e; i++) { + auto input_tensor = func_inputs[i]; const auto& tensor = *subgraph.tensors.at(input_tensor); auto loc = TensorLoc(tensor, builder, base_loc); if (vals_map[input_tensor]) { @@ -806,9 +840,9 @@ StatusOr ConvertSubgraph( // Set tf.entry_function attribute if (is_entry_point) { llvm::SmallVector attributes; - if (!subgraph.inputs.empty()) { + if (!func_inputs.empty()) { attributes.push_back(BuildTFEntryFunctionAttribute( - subgraph, &builder, "inputs", subgraph.inputs)); + subgraph, &builder, "inputs", func_inputs)); } if (!func_outputs.empty()) { attributes.push_back(BuildTFEntryFunctionAttribute( @@ -820,7 +854,7 @@ StatusOr ConvertSubgraph( absl::flat_hash_set pruned_subgraph_ops; if (experimental_prune_unreachable_nodes_unconditionally) { TF_ASSIGN_OR_RETURN(pruned_subgraph_ops, - PruneSubgraph(subgraph, func_outputs)); + PruneSubgraph(subgraph, func_inputs, func_outputs)); } // Construct MLIR operators from TFLite operators @@ -931,8 +965,9 @@ std::string SubgraphName(unsigned index, const tflite::SubGraphT& subgraph) { OwningModuleRef tflite::FlatBufferToMlir( absl::string_view buffer, MLIRContext* context, Location base_loc, - const std::vector& ordered_output_arrays, bool use_external_constant, + const std::vector& ordered_input_arrays, + const std::vector& ordered_output_arrays, bool experimental_prune_unreachable_nodes_unconditionally) { auto model_ptr = FlatBufferModel::VerifyAndBuildFromBuffer(buffer.data(), buffer.length()); @@ -971,33 +1006,25 @@ OwningModuleRef tflite::FlatBufferToMlir( builder.getStringAttr(model->description)); } - if (!ordered_output_arrays.empty() && model->subgraphs.size() > 1) { - // TODO(b/141485522): support more than one subgraph. 
- return emitError(base_loc, - "ordered_output_arrays does not support more than one " - "subgraph yet"), - nullptr; - } - for (auto e : llvm::enumerate(model->subgraphs)) { auto& subgraph = e.value(); std::string name = SubgraphName(e.index(), *subgraph); auto func_or_error = ConvertSubgraph( *subgraph, name, operator_names, func_names, model->buffers, base_loc, - // Only the entry point needs pseudo_input_ops + builder, // TODO(b/131175224,b/132239787) Support multiple entry points - builder, ordered_output_arrays, /*is_entry_point=*/e.index() == 0, - /*use_external_constant=*/use_external_constant, + /*use_external_constant=*/use_external_constant, ordered_input_arrays, + ordered_output_arrays, experimental_prune_unreachable_nodes_unconditionally); if (!func_or_error.ok()) { return emitError(base_loc, "could not translate function ") - << subgraph->name, + << subgraph->name << ": " + << func_or_error.status().error_message(), nullptr; } module.push_back(func_or_error.ConsumeValueOrDie()); } - // TFLite subgraphs do not necessarily have names, return OwningModuleRef(module); } @@ -1012,17 +1039,24 @@ static OwningModuleRef FlatBufferFileToMlirTrans( auto loc = mlir::FileLineColLoc::get(input->getBufferIdentifier(), 0, 0, context); - // Parses output_arrays_order from command line option. + // Parses input/output names from command line options. + std::vector inputs; std::vector outputs; - if (!tensorflow::ParseOutputArrayInfo(output_arrays_string, &outputs).ok()) { + // Use output parser since we only have tensor names. + if (!tensorflow::ParseOutputArrayInfo(input_arrays_flag, &inputs).ok()) { + return emitError(loc, "parsing input array info failed ") + << input_arrays_flag, + nullptr; + } + if (!tensorflow::ParseOutputArrayInfo(output_arrays_flag, &outputs).ok()) { return emitError(loc, "parsing output array info failed ") - << output_arrays_string, + << output_arrays_flag, nullptr; } return tflite::FlatBufferToMlir( absl::string_view(input->getBufferStart(), input->getBufferSize()), - context, loc, outputs, use_external_constant, + context, loc, use_external_constant, inputs, outputs, experimental_prune_unreachable_nodes_unconditionally); } diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_import.h b/tensorflow/compiler/mlir/lite/flatbuffer_import.h index e3210c6d03f..5dba9a0efc4 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_import.h +++ b/tensorflow/compiler/mlir/lite/flatbuffer_import.h @@ -35,9 +35,9 @@ namespace tflite { // are not ancestors of the output nodes will be pruned. mlir::OwningModuleRef FlatBufferToMlir( absl::string_view buffer, mlir::MLIRContext* context, - mlir::Location base_loc, - const std::vector& ordered_output_arrays, - bool use_external_constant = false, + mlir::Location base_loc, bool use_external_constant = false, + const std::vector& ordered_input_arrays = {}, + const std::vector& ordered_output_arrays = {}, bool experimental_prune_unreachable_nodes_unconditionally = false); } // namespace tflite diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc b/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc index 13e638fae66..ac20ab68eaa 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc @@ -42,7 +42,7 @@ limitations under the License. 
#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/ToolOutputFile.h" #include "mlir/Dialect/QuantOps/QuantTypes.h" // TF:llvm-project -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project #include "mlir/IR/Function.h" // TF:llvm-project #include "mlir/IR/Location.h" // TF:llvm-project @@ -122,8 +122,6 @@ bool emit_custom_ops; bool emit_select_tf_ops; bool lower_tensor_list_ops; bool strip_debug_info; -// NOLINTNEXTLINE -std::string output_arrays_string; // NOLINTNEXTLINE static opt emit_builtin_tflite_ops_flag( @@ -156,11 +154,6 @@ static opt strip_debug_info_flag( "strip-debug-info", llvm::cl::desc("Strip debug info during export"), llvm::cl::location(strip_debug_info), llvm::cl::init(false)); -// NOLINTNEXTLINE -static opt output_arrays_flag( - "output-arrays", llvm::cl::desc("List of output tensors"), - llvm::cl::location(output_arrays_string), llvm::cl::init("")); - ABSL_CONST_INIT const absl::string_view kFlexOpNamePrefix = "Flex"; // Use initial buffer size in flatbuffer builder to be same as the initial size @@ -172,7 +165,7 @@ constexpr size_t kInitialBufferSize = 10240; // `isSigned` is set to false for other types. static StatusOr GetTFLiteType(Type type, bool is_signed = true) { - if (!is_signed && type.isInteger(8)) { + if (!is_signed && type.isSignlessInteger(8)) { return tflite::TensorType_UINT8; } if (!is_signed) { diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_translate_flags.h b/tensorflow/compiler/mlir/lite/flatbuffer_translate_flags.h index 71567e1d05e..6c8f80d4e05 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_translate_flags.h +++ b/tensorflow/compiler/mlir/lite/flatbuffer_translate_flags.h @@ -27,7 +27,5 @@ extern bool emit_custom_ops; extern bool lower_tensor_list_ops; // The flag to control whether debug info gets stripped on export. extern bool strip_debug_info; -// The flag to control the output array info of tflite graph. -extern std::string output_arrays_string; #endif // TENSORFLOW_COMPILER_MLIR_LITE_FLATBUFFER_TRANSLATE_FLAGS_H_ diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc index be70d20dc12..83e372e5732 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc @@ -26,7 +26,7 @@ limitations under the License. #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/FormatVariadic.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project #include "mlir/IR/Matchers.h" // TF:llvm-project @@ -275,7 +275,7 @@ Attribute ConstFoldBinaryOp( return ConstFoldBinaryOp(result_type, operands[0], operands[1], float_calculate, is_commutative); - if (elemType.isa()) + if (elemType.isSignlessInteger()) return ConstFoldBinaryOp(result_type, operands[0], operands[1], int_calculate, is_commutative); @@ -723,12 +723,11 @@ static LogicalResult Verify(PackOp op) { } // Make sure all inputs have the same shape and element type. - // TODO(rahulsp): Simplify once b/135032064 is fixed. - for (Value operand : op.getOperands()) { - auto other_type = operand.getType().cast(); - if (input_type != other_type) + // TODO(b/135032063): Simplify once fixed. 
+ for (Type operand_type : op.getOperandTypes()) { + if (failed(mlir::verifyCompatibleShape(input_type, operand_type))) return op.emitOpError("operands should be of the same type. got ") - << input_type << ", " << other_type; + << input_type << ", " << operand_type; } return success(); @@ -1561,7 +1560,7 @@ OpFoldResult RangeOp::fold(ArrayRef operands) { limit_tensor.getType().getRank() == 0 && delta_tensor.getType().getRank() == 0); Type elem_type = getType().cast().getElementType(); - if (elem_type.isa()) { + if (elem_type.isSignlessInteger()) { auto start_attr = start_tensor.getValue({}); auto limit_attr = limit_tensor.getValue({}); auto delta_attr = delta_tensor.getValue({}); @@ -1663,7 +1662,7 @@ OpFoldResult TransposeOp::fold(ArrayRef operands) { // Do not try to fold elements attr of a quant type because // DenseElementsAttr does not support it. - if (!getType().cast().getElementType().isIntOrFloat()) + if (!getType().cast().getElementType().isSignlessIntOrFloat()) return nullptr; assert(perm_tensor.getType().getRank() == 1); diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index a04e1d44ea6..36a1e93dc26 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -1656,7 +1656,7 @@ def TFL_MaximumOp : TFL_Op<"maximum", [ let hasOptions = 0; } -def TFL_MeanOp : TFL_Op<"mean", [NoSideEffect, SameOperandsAndResultsScale]> { +def TFL_MeanOp : TFL_Op<"mean", [NoSideEffect]> { let summary = "Mean operator"; let description = [{ @@ -2482,11 +2482,11 @@ def TFL_TileOp: TFL_Op<"tile", [NoSideEffect, SameOperandsAndResultsScale, }]; let arguments = (ins - TFL_TensorOf<[F32, I1, I32, I64, TFL_Uint8, QUI8]>:$input, + TFL_TensorOf<[F32, I1, I32, I64, TFL_Uint8, QUI8, TFL_Str]>:$input, TFL_I32OrI64Tensor:$multiples); let results = (outs - TFL_TensorOf<[F32, I1, I32, I64, TFL_Uint8, QUI8]>:$output); + TFL_TensorOf<[F32, I1, I32, I64, TFL_Uint8, QUI8, TFL_Str]>:$output); let hasOptions = 0; } diff --git a/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc index f2b89aebb44..8ad448e5d83 100644 --- a/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc @@ -63,6 +63,41 @@ const char kDetectionPostProcessOp[] = "'detections_per_class' type: 'int' default_value { i : 100 }} attr { " "name: 'use_regular_nms' type: 'bool' default_value { b : false }}"; +const char kUnidirectionalSequenceLstmOp[] = + "name: 'UnidirectionalSequenceLstm' input_arg: {name: 'Input' type: " + "DT_FLOAT} input_arg: { name: 'InputToInputWeights' type: DT_FLOAT } " + "input_arg: { name: 'InputToForgetWeights' type: DT_FLOAT } input_arg: { " + "name: 'InputToCellWeights' type: DT_FLOAT} input_arg: { name: " + "'InputToOutputWeights' type: DT_FLOAT } input_arg: { name: " + "'RecurrentToInputWeights' type: DT_FLOAT} input_arg: { name: " + "'RecurrentToForgetWeights' type: DT_FLOAT} input_arg: { name: " + "'RecurrentToCellWeights' type: DT_FLOAT } input_arg: { name: " + "'RecurrentToOutputWeights' type: DT_FLOAT } input_arg: { name: " + "'CellToInputWeights' type: DT_FLOAT} input_arg: { name: " + "'CellToForgetWeights' type: DT_FLOAT } input_arg: { name: " + "'CellToOutputWeights' type: DT_FLOAT } input_arg: { name: 'InputGateBias' " + "type: DT_FLOAT } input_arg: { name: 'ForgetGateBias' type: DT_FLOAT } " + "input_arg: { name: 'kCellGateBias' type: DT_FLOAT } 
input_arg: { name: " + "'OutputGateBias' type: DT_FLOAT } input_arg: { name: 'ProjectionWeights' " + "type: DT_FLOAT } input_arg: { name: 'ProjectionBias' type: DT_FLOAT } " + "input_arg: { name: 'InputActivationState' type: DT_FLOAT} input_arg: { " + "name: 'InputCellStateTensor' type: DT_FLOAT } " + "output_arg: { name: 'Concat' type: DT_FLOAT} " + "output_arg: { name: " + "'LastState' type: DT_FLOAT } output_arg: { name: 'Output' type: DT_FLOAT} " + "attr : { name: '_tflite_input_indices' type: 'list(int)'}"; + +const char kUnidirectionalSequenceRnnOp[] = + "name: 'UnidirectionalSequenceRnn' input_arg: {name: 'Input' type: " + "DT_FLOAT} input_arg: { name: 'Weights' type: DT_FLOAT } " + "input_arg: { name: 'RecurrentWeights' type: DT_FLOAT } input_arg: { " + "name: 'Bias' type: DT_FLOAT} " + "input_arg: { name: 'HiddenState' type: DT_FLOAT} " + "output_arg: { name: " + "'LastState' type: DT_FLOAT } output_arg: { name: 'Output' type: " + "DT_FLOAT} " + "attr : { name: '_tflite_input_indices' type: 'list(int)'}"; + // Converts the toco::IODataType to tensorflow::DataType. Only contains the // conversion mapping for constants defined in TFLite Python API. DataType ConvertIODataTypeToDataType(toco::IODataType dtype) { @@ -260,6 +295,8 @@ Status ConvertGraphDefToTFLiteFlatBuffer(const toco::ModelFlags& model_flags, std::vector extra_tf_opdefs(toco_flags.custom_opdefs().begin(), toco_flags.custom_opdefs().end()); extra_tf_opdefs.push_back(kDetectionPostProcessOp); + extra_tf_opdefs.push_back(kUnidirectionalSequenceLstmOp); + extra_tf_opdefs.push_back(kUnidirectionalSequenceRnnOp); TF_RETURN_IF_ERROR(RegisterCustomBuiltinOps(extra_tf_opdefs)); TF_ASSIGN_OR_RETURN( diff --git a/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc b/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc index 45e87e63475..617f968b958 100644 --- a/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc +++ b/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc @@ -25,7 +25,7 @@ limitations under the License. 
#include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/QuantOps/FakeQuantSupport.h" // TF:llvm-project #include "mlir/Dialect/QuantOps/QuantOps.h" // TF:llvm-project -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/AffineExpr.h" // TF:llvm-project #include "mlir/IR/AffineMap.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc index eca95cbadec..2f677397109 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc +++ b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc @@ -61,11 +61,9 @@ TfLiteStatus QuantizeModel( std::string serialized_model( reinterpret_cast(input_builder.GetBufferPointer()), input_builder.GetSize()); - std::vector output_arrays_order; - OwningModuleRef module = - tflite::FlatBufferToMlir(serialized_model, &context, - UnknownLoc::get(&context), output_arrays_order); + OwningModuleRef module = tflite::FlatBufferToMlir(serialized_model, &context, + UnknownLoc::get(&context)); if (!module) { error_reporter->Report("Couldn't import flatbuffer to MLIR."); return kTfLiteError; diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc index b2355b2ae6e..5f52c892421 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc @@ -26,7 +26,7 @@ limitations under the License. #include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/QuantOps/QuantOps.h" // TF:llvm-project #include "mlir/Dialect/QuantOps/QuantTypes.h" // TF:llvm-project -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project #include "mlir/IR/Function.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h index ed998510328..9bb1d677df2 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h @@ -26,7 +26,7 @@ limitations under the License. #include "mlir/Dialect/QuantOps/FakeQuantSupport.h" // TF:llvm-project #include "mlir/Dialect/QuantOps/QuantOps.h" // TF:llvm-project #include "mlir/Dialect/QuantOps/QuantTypes.h" // TF:llvm-project -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/BlockAndValueMapping.h" // TF:llvm-project #include "mlir/IR/Function.h" // TF:llvm-project @@ -191,7 +191,7 @@ struct QuantizationPattern : public RewritePattern { auto ele_type = operand.getType().cast().getElementType(); if (auto op_inst = dyn_cast_or_null(operand.getDefiningOp())) { inputs.push_back(op_inst.input()); - } else if (ele_type.isa()) { + } else if (ele_type.isSignlessInteger()) { // If the operand is an integer tensor, then it doesn't require the // DQ op in the pattern. 
inputs.push_back(operand); @@ -225,7 +225,7 @@ struct QuantizationPattern : public RewritePattern { auto user = llvm::cast(*result.user_begin()); outputs_replaced.insert({user.output(), enumerated_result.index()}); output_types.push_back(user.getType()); - } else if (result_ele_type.template isa()) { + } else if (result_ele_type.isSignlessInteger()) { // If the result is an integer tensor, then it doesn't require the // D op in the pattern. outputs_replaced.insert({result, enumerated_result.index()}); diff --git a/tensorflow/compiler/mlir/lite/quantization/xla/materialize.cc b/tensorflow/compiler/mlir/lite/quantization/xla/materialize.cc index 7c2846231c9..0c746d0c943 100644 --- a/tensorflow/compiler/mlir/lite/quantization/xla/materialize.cc +++ b/tensorflow/compiler/mlir/lite/quantization/xla/materialize.cc @@ -26,7 +26,7 @@ limitations under the License. #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "mlir/Dialect/QuantOps/QuantOps.h" // TF:llvm-project -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/MLIRContext.h" // TF:llvm-project #include "mlir/IR/PatternMatch.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/lite/sparsity/sparsify_model.cc b/tensorflow/compiler/mlir/lite/sparsity/sparsify_model.cc index d0358891aaa..c05337918f2 100644 --- a/tensorflow/compiler/mlir/lite/sparsity/sparsify_model.cc +++ b/tensorflow/compiler/mlir/lite/sparsity/sparsify_model.cc @@ -48,11 +48,9 @@ TfLiteStatus SparsifyModel(const tflite::ModelT& input_model, std::string serialized_model( reinterpret_cast(input_builder.GetBufferPointer()), input_builder.GetSize()); - std::vector output_arrays_order; - OwningModuleRef module = - tflite::FlatBufferToMlir(serialized_model, &context, - UnknownLoc::get(&context), output_arrays_order); + OwningModuleRef module = tflite::FlatBufferToMlir(serialized_model, &context, + UnknownLoc::get(&context)); if (!module) { error_reporter->Report("Couldn't import flatbuffer to MLIR."); return kTfLiteError; diff --git a/tensorflow/compiler/mlir/lite/tests/dilated-conv.mlir b/tensorflow/compiler/mlir/lite/tests/dilated-conv.mlir index a6d6ec52234..5fe5fbfb3ee 100644 --- a/tensorflow/compiler/mlir/lite/tests/dilated-conv.mlir +++ b/tensorflow/compiler/mlir/lite/tests/dilated-conv.mlir @@ -27,6 +27,20 @@ func @testDilatedConvWithNonZeroSTBPadding(%arg0: tensor<1x128x128x3xf32>, %arg1 // CHECK-NEXT: return [[RESULT]] : tensor<1x128x128x8xf32> } +func @testDilatedConvWithNonTrivialDilations(%arg0: tensor<1x128x128x3xf32>, %arg1: tensor<2x2xi32>, %arg2: tensor<5x5x3x8xf32>) -> tensor<1x128x128x8xf32> { + %cst = constant dense<[2, 2]> : tensor<2xi32> + %0 = "tf.SpaceToBatchND"(%arg0, %cst, %arg1) : (tensor<1x128x128x3xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<4x68x68x3xf32> + %1 = "tf.Conv2D"(%0, %arg2) {padding = "VALID", dilations = [1, 2, 2, 1], strides = [1, 1, 1, 1]} : (tensor<4x68x68x3xf32>, tensor<5x5x3x8xf32>) -> tensor<4x64x64x8xf32> + %2 = "tf.BatchToSpaceND"(%1, %cst, %arg1) : (tensor<4x64x64x8xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<1x128x128x8xf32> + return %2 : tensor<1x128x128x8xf32> + + // CHECK-LABEL: testDilatedConvWithNonTrivialDilations + // CHECK: [[STB:%.*]] = "tf.SpaceToBatchND" + // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D" + // CHECK-NEXT: [[RESULT:%.*]] = "tf.BatchToSpaceND" + // CHECK-NEXT: return [[RESULT]] +} + func @testDilatedDepthWiseConv(%arg0: 
tensor<1x128x128x3xf32>, %arg1: tensor<2x2xi32>, %arg2: tensor<5x5x3x8xf32>) -> tensor<1x128x128x8xf32> { %cst = constant dense<[2, 2]> : tensor<2xi32> %0 = "tf.SpaceToBatchND"(%arg0, %cst, %arg1) : (tensor<1x128x128x3xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<4x68x68x3xf32> @@ -104,7 +118,7 @@ func @testDilatedDepthWiseConvWithBiasAdd(%arg0: tensor<1x128x128x3xf32>, %arg1: func @testDilatedConvWithExpandSqueeze1(%arg0: tensor<1x128x128xf32>, %arg1: tensor<2x2xi32>, %arg2: tensor<5x5x1x1xf32>, %arg3: tensor<128xf32>) -> tensor<1x128x128xf32> { %cst = constant dense<[2, 2]> : tensor<2xi32> - %cst_0 = constant dense<3> : tensor + %cst_0 = "tf.Const"() { value = dense<3> : tensor } : () -> tensor %0 = "tf.SpaceToBatchND"(%arg0, %cst, %arg1) : (tensor<1x128x128xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<4x68x68xf32> %1 = "tf.ExpandDims"(%0, %cst_0) : (tensor<4x68x68xf32>, tensor) -> tensor<4x68x68x1xf32> %2 = "tf.Conv2D"(%1, %arg2) {padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<4x68x68x1xf32>, tensor<5x5x1x1xf32>) -> tensor<4x64x64x1xf32> @@ -115,7 +129,7 @@ func @testDilatedConvWithExpandSqueeze1(%arg0: tensor<1x128x128xf32>, %arg1: ten // CHECK-LABEL: testDilatedConvWithExpandSqueeze1 // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x128xf32>, [[PADDING:%.*]]: tensor<2x2xi32>, [[FILTER:%.*]]: tensor<5x5x1x1xf32>, [[BIAS:%.*]]: tensor<128xf32>) - // CHECK-NEXT: [[AXIS:%.*]] = constant dense<3> : tensor + // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() {value = dense<3> : tensor} : () -> tensor // CHECK-NEXT: [[EXPAND:%.*]] = "tf.ExpandDims"([[INPUT]], [[AXIS]]) : (tensor<1x128x128xf32>, tensor) -> tensor<1x128x128x1xf32> // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D"([[EXPAND]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<1x128x128x1xf32>, tensor<5x5x1x1xf32>) -> tensor<1x128x128x1xf32> // CHECK-NEXT: [[SQUEEZE:%.*]] = "tf.Squeeze"([[CONV]]) {squeeze_dims = [3]} : (tensor<1x128x128x1xf32>) -> tensor<1x128x128xf32> @@ -125,7 +139,7 @@ func @testDilatedConvWithExpandSqueeze1(%arg0: tensor<1x128x128xf32>, %arg1: ten func @testDilatedDepthWiseConvWithExpandSqueeze1(%arg0: tensor<1x128x128xf32>, %arg1: tensor<2x2xi32>, %arg2: tensor<5x5x1x1xf32>, %arg3: tensor<128xf32>) -> tensor<1x128x128xf32> { %cst = constant dense<[2, 2]> : tensor<2xi32> - %cst_0 = constant dense<3> : tensor + %cst_0 = "tf.Const"() { value = dense<3> : tensor } : () -> tensor %0 = "tf.SpaceToBatchND"(%arg0, %cst, %arg1) : (tensor<1x128x128xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<4x68x68xf32> %1 = "tf.ExpandDims"(%0, %cst_0) : (tensor<4x68x68xf32>, tensor) -> tensor<4x68x68x1xf32> %2 = "tf.DepthwiseConv2dNative"(%1, %arg2) {padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<4x68x68x1xf32>, tensor<5x5x1x1xf32>) -> tensor<4x64x64x1xf32> @@ -136,7 +150,7 @@ func @testDilatedDepthWiseConvWithExpandSqueeze1(%arg0: tensor<1x128x128xf32>, % // CHECK-LABEL: testDilatedDepthWiseConvWithExpandSqueeze1 // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x128xf32>, [[PADDING:%.*]]: tensor<2x2xi32>, [[FILTER:%.*]]: tensor<5x5x1x1xf32>, [[BIAS:%.*]]: tensor<128xf32>) - // CHECK-NEXT: [[AXIS:%.*]] = constant dense<3> : tensor + // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() {value = dense<3> : tensor} : () -> tensor // CHECK-NEXT: [[EXPAND:%.*]] = "tf.ExpandDims"([[INPUT]], [[AXIS]]) : (tensor<1x128x128xf32>, tensor) -> tensor<1x128x128x1xf32> // CHECK-NEXT: [[CONV:%.*]] = "tf.DepthwiseConv2dNative"([[EXPAND]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "VALID", strides = [1, 1, 1, 1]} : 
(tensor<1x128x128x1xf32>, tensor<5x5x1x1xf32>) -> tensor<1x128x128x1xf32> // CHECK-NEXT: [[SQUEEZE:%.*]] = "tf.Squeeze"([[CONV]]) {squeeze_dims = [3]} : (tensor<1x128x128x1xf32>) -> tensor<1x128x128xf32> @@ -146,7 +160,7 @@ func @testDilatedDepthWiseConvWithExpandSqueeze1(%arg0: tensor<1x128x128xf32>, % func @testDilatedConvWithExpandSqueeze2(%arg0: tensor<1x128x128xf32>, %arg1: tensor<2x2xi32>, %arg2: tensor<5x5x1x1xf32>, %arg3: tensor) -> tensor<1x128x128xf32> { %cst = constant dense<[2, 2]> : tensor<2xi32> - %cst_0 = constant dense<3> : tensor + %cst_0 = "tf.Const"() { value = dense<3> : tensor } : () -> tensor %0 = "tf.SpaceToBatchND"(%arg0, %cst, %arg1) : (tensor<1x128x128xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<4x?x?xf32> %1 = "tf.ExpandDims"(%0, %cst_0) : (tensor<4x?x?xf32>, tensor) -> tensor<4x?x?x1xf32> %2 = "tf.Conv2D"(%1, %arg2) {padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<4x?x?x1xf32>, tensor<5x5x1x1xf32>) -> tensor<4x?x?x1xf32> @@ -157,7 +171,7 @@ func @testDilatedConvWithExpandSqueeze2(%arg0: tensor<1x128x128xf32>, %arg1: ten // CHECK-LABEL: testDilatedConvWithExpandSqueeze2 // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x128xf32>, [[PADDING:%.*]]: tensor<2x2xi32>, [[FILTER:%.*]]: tensor<5x5x1x1xf32>, [[BIAS:%.*]]: tensor) - // CHECK-NEXT: [[AXIS:%.*]] = constant dense<3> : tensor + // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() {value = dense<3> : tensor} : () -> tensor // CHECK-NEXT: [[EXPAND:%.*]] = "tf.ExpandDims"([[INPUT]], [[AXIS]]) : (tensor<1x128x128xf32>, tensor) -> tensor<1x128x128x1xf32> // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D"([[EXPAND]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<1x128x128x1xf32>, tensor<5x5x1x1xf32>) -> tensor<1x128x128x1xf32> // CHECK-NEXT: [[SQUEEZE:%.*]] = "tf.Squeeze"([[CONV]]) {squeeze_dims = [3]} : (tensor<1x128x128x1xf32>) -> tensor<1x128x128xf32> @@ -167,7 +181,7 @@ func @testDilatedConvWithExpandSqueeze2(%arg0: tensor<1x128x128xf32>, %arg1: ten func @testDilatedDepthWiseConvWithExpandSqueeze2(%arg0: tensor<1x128x128xf32>, %arg1: tensor<2x2xi32>, %arg2: tensor<5x5x1x1xf32>, %arg3: tensor) -> tensor<1x128x128xf32> { %cst = constant dense<[2, 2]> : tensor<2xi32> - %cst_0 = constant dense<3> : tensor + %cst_0 = "tf.Const"() { value = dense<3> : tensor } : () -> tensor %0 = "tf.SpaceToBatchND"(%arg0, %cst, %arg1) : (tensor<1x128x128xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<4x?x?xf32> %1 = "tf.ExpandDims"(%0, %cst_0) : (tensor<4x?x?xf32>, tensor) -> tensor<4x?x?x1xf32> %2 = "tf.DepthwiseConv2dNative"(%1, %arg2) {padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<4x?x?x1xf32>, tensor<5x5x1x1xf32>) -> tensor<4x?x?x1xf32> @@ -178,7 +192,7 @@ func @testDilatedDepthWiseConvWithExpandSqueeze2(%arg0: tensor<1x128x128xf32>, % // CHECK-LABEL: testDilatedDepthWiseConvWithExpandSqueeze2 // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x128xf32>, [[PADDING:%.*]]: tensor<2x2xi32>, [[FILTER:%.*]]: tensor<5x5x1x1xf32>, [[BIAS:%.*]]: tensor) - // CHECK-NEXT: [[AXIS:%.*]] = constant dense<3> : tensor + // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() {value = dense<3> : tensor} : () -> tensor // CHECK-NEXT: [[EXPAND:%.*]] = "tf.ExpandDims"([[INPUT]], [[AXIS]]) : (tensor<1x128x128xf32>, tensor) -> tensor<1x128x128x1xf32> // CHECK-NEXT: [[CONV:%.*]] = "tf.DepthwiseConv2dNative"([[EXPAND]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<1x128x128x1xf32>, tensor<5x5x1x1xf32>) -> tensor<1x128x128x1xf32> // CHECK-NEXT: [[SQUEEZE:%.*]] = "tf.Squeeze"([[CONV]]) 
{squeeze_dims = [3]} : (tensor<1x128x128x1xf32>) -> tensor<1x128x128xf32> @@ -188,7 +202,7 @@ func @testDilatedDepthWiseConvWithExpandSqueeze2(%arg0: tensor<1x128x128xf32>, % func @testDilatedConvWithExpandSqueeze3(%arg0: tensor<1x128x128xf32>, %arg1: tensor<2x2xi32>, %arg2: tensor<5x5x1x1xf32>, %arg3: tensor<128xf32>) -> tensor<1x128x128xf32> { %cst = constant dense<[2, 2]> : tensor<2xi32> - %cst_0 = constant dense<3> : tensor + %cst_0 = "tf.Const"() { value = dense<3> : tensor } : () -> tensor %0 = "tf.SpaceToBatchND"(%arg0, %cst, %arg1) : (tensor<1x128x128xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<4x68x68xf32> %1 = "tf.ExpandDims"(%0, %cst_0) : (tensor<4x68x68xf32>, tensor) -> tensor<4x68x68x1xf32> %2 = "tf.Conv2D"(%1, %arg2) {padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<4x68x68x1xf32>, tensor<5x5x1x1xf32>) -> tensor<4x64x64x1xf32> @@ -200,7 +214,7 @@ func @testDilatedConvWithExpandSqueeze3(%arg0: tensor<1x128x128xf32>, %arg1: ten // CHECK-LABEL: testDilatedConvWithExpandSqueeze3 // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x128xf32>, [[PADDING:%.*]]: tensor<2x2xi32>, [[FILTER:%.*]]: tensor<5x5x1x1xf32>, [[BIAS:%.*]]: tensor<128xf32>) - // CHECK-NEXT: [[AXIS:%.*]] = constant dense<3> : tensor + // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() {value = dense<3> : tensor} : () -> tensor // CHECK-NEXT: [[EXPAND:%.*]] = "tf.ExpandDims"([[INPUT]], [[AXIS]]) : (tensor<1x128x128xf32>, tensor) -> tensor<1x128x128x1xf32> // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D"([[EXPAND]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<1x128x128x1xf32>, tensor<5x5x1x1xf32>) -> tensor<1x128x128x1xf32> // CHECK-NEXT: [[SQUEEZE:%.*]] = "tf.Squeeze"([[CONV]]) {squeeze_dims = [3]} : (tensor<1x128x128x1xf32>) -> tensor<1x128x128xf32> @@ -210,7 +224,7 @@ func @testDilatedConvWithExpandSqueeze3(%arg0: tensor<1x128x128xf32>, %arg1: ten func @testDilatedDepthWiseConvWithExpandSqueeze3(%arg0: tensor<1x128x128xf32>, %arg1: tensor<2x2xi32>, %arg2: tensor<5x5x1x1xf32>, %arg3: tensor<128xf32>) -> tensor<1x128x128xf32> { %cst = constant dense<[2, 2]> : tensor<2xi32> - %cst_0 = constant dense<3> : tensor + %cst_0 = "tf.Const"() { value = dense<3> : tensor } : () -> tensor %0 = "tf.SpaceToBatchND"(%arg0, %cst, %arg1) : (tensor<1x128x128xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<4x68x68xf32> %1 = "tf.ExpandDims"(%0, %cst_0) : (tensor<4x68x68xf32>, tensor) -> tensor<4x68x68x1xf32> %2 = "tf.DepthwiseConv2dNative"(%1, %arg2) {padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<4x68x68x1xf32>, tensor<5x5x1x1xf32>) -> tensor<4x64x64x1xf32> @@ -222,10 +236,29 @@ func @testDilatedDepthWiseConvWithExpandSqueeze3(%arg0: tensor<1x128x128xf32>, % // CHECK-LABEL: testDilatedDepthWiseConvWithExpandSqueeze3 // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x128xf32>, [[PADDING:%.*]]: tensor<2x2xi32>, [[FILTER:%.*]]: tensor<5x5x1x1xf32>, [[BIAS:%.*]]: tensor<128xf32>) - // CHECK-NEXT: [[AXIS:%.*]] = constant dense<3> : tensor + // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() {value = dense<3> : tensor} : () -> tensor // CHECK-NEXT: [[EXPAND:%.*]] = "tf.ExpandDims"([[INPUT]], [[AXIS]]) : (tensor<1x128x128xf32>, tensor) -> tensor<1x128x128x1xf32> // CHECK-NEXT: [[CONV:%.*]] = "tf.DepthwiseConv2dNative"([[EXPAND]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<1x128x128x1xf32>, tensor<5x5x1x1xf32>) -> tensor<1x128x128x1xf32> // CHECK-NEXT: [[SQUEEZE:%.*]] = "tf.Squeeze"([[CONV]]) {squeeze_dims = [3]} : (tensor<1x128x128x1xf32>) -> tensor<1x128x128xf32> // 
CHECK-NEXT: [[RESULT:%.*]] = "tf.BiasAdd"([[SQUEEZE]], [[BIAS]]) : (tensor<1x128x128xf32>, tensor<128xf32>) -> tensor<1x128x128xf32> // CHECK-NEXT: return [[RESULT]] : tensor<1x128x128xf32> } + +func @testDilatedConvWithDifferentExpandSqueezeAxis(%arg0: tensor<1x128x128xf32>, %arg1: tensor<2x2xi32>, %arg2: tensor<5x5x1x1xf32>, %arg3: tensor<128xf32>) -> tensor<1x128x128x1xf32> { + %cst = constant dense<[2, 2]> : tensor<2xi32> + %cst_0 = "tf.Const"() { value = dense<3> : tensor } : () -> tensor + %0 = "tf.SpaceToBatchND"(%arg0, %cst, %arg1) : (tensor<1x128x128xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<4x68x68xf32> + %1 = "tf.ExpandDims"(%0, %cst_0) : (tensor<4x68x68xf32>, tensor) -> tensor<4x68x68x1xf32> + %2 = "tf.Conv2D"(%1, %arg2) {padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<4x68x68x1xf32>, tensor<5x5x1x1xf32>) -> tensor<4x64x64x1xf32> + %3 = "tf.Squeeze"(%2) {squeeze_dims = [2]} : (tensor<4x64x64x1xf32>) -> tensor<4x64x64x1xf32> + %4 = "tf.BatchToSpaceND"(%3, %cst, %arg1) : (tensor<4x64x64x1xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<1x128x128x1xf32> + return %4 : tensor<1x128x128x1xf32> + + // CHECK-LABEL: testDilatedConvWithDifferentExpandSqueezeAxis + // CHECK: [[STB:%.*]] = "tf.SpaceToBatchND" + // CHECK-NEXT: [[EXPAND:%.*]] = "tf.ExpandDims" + // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D" + // CHECK-NEXT: [[SQUEEZE:%.*]] = "tf.Squeeze" + // CHECK-NEXT: [[RESULT:%.*]] = "tf.BatchToSpaceND" + // CHECK-NEXT: return [[RESULT]] +} diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/input_arrays.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/input_arrays.mlir new file mode 100644 index 00000000000..b9b62cdc220 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/input_arrays.mlir @@ -0,0 +1,13 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_translate -input-arrays=squared_difference --experimental-prune-unreachable-nodes-unconditionally --tflite-flatbuffer-to-mlir - -o - | FileCheck --dump-input-on-failure %s +// Tests -input-arrays flag. 
+ +func @main(%arg0: tensor<4xf32>) -> tensor<4xf32> { + %0 = "tfl.pseudo_const" () {value = dense<1.0> : tensor<4xf32>} : () -> tensor<4xf32> loc("Const") + %1 = "tfl.squared_difference"(%arg0, %0) {fused_activation_function = "NONE"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> loc("squared_difference") + %2 = "tfl.mul"(%0, %1) {fused_activation_function = "NONE"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> loc("mul") + return %2 : tensor<4xf32> + +// CHECK-LABEL: main +// CHECK-NOT: tfl.squared_difference +// CHECK: tfl.mul %[[CONST:.*]], %arg0 +} diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/lstm.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/lstm.mlir new file mode 100644 index 00000000000..6003471f106 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/lstm.mlir @@ -0,0 +1,15 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_translate --tflite-flatbuffer-to-mlir - -o - | FileCheck --dump-input-on-failure %s +// Ensure lstm roundtrip exactly + +func @main(%arg0: tensor<4 x f32>, %arg1: tensor<4 x f32>, %arg2: tensor<4 x f32>, %arg3: tensor<4 x f32>, %arg4: tensor<4 x f32>, %arg5: tensor<4 x f32>, %arg6: tensor<4 x f32>, %arg7: tensor<4 x f32>, %arg8: tensor<4 x f32>, %arg9: tensor<4 x f32>, %arg10: tensor<4 x f32>, %arg11: tensor<4 x f32>, %arg12: tensor<4 x f32>, %arg13: tensor<4 x f32>, %arg14: tensor<4 x f32>, %arg15: tensor<4 x f32>, %arg16: tensor<4 x f32>, %arg17: tensor<4 x f32>, %arg18: tensor<4 x f32>, %arg19: tensor<4 x f32>, %arg20: tensor<4 x f32>, %arg21: tensor<4 x f32>) -> tensor<4 x f32> { + %cst0 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> loc("Const") + %cst1 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> loc("Const") + %24 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %cst0, %cst1, %arg18, %arg19, %arg20, %arg21) ({}) {fused_activation_function = "NONE", kernel_type = "FULL"} : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + return %24 : tensor<4xf32> +// CHECK-LABEL: main +// seperate lines since there is no region for this op. 
third_party/tensorflow/compiler/mlir/lite/ir/tfl_ops.td: 3252 +// CHECK: %[[RES0:.*]] = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg22, %arg23, %arg18, %arg19, %arg20, %arg21) ( { +// CHECK: }) {cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "FULL", proj_clip = 0.000000e+00 : f32} : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> +// CHECK: return %[[RES0]] + +} diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir index e40047ea216..1256571c3b4 100644 --- a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir @@ -123,6 +123,17 @@ func @softmax(%arg0: tensor<8x16xf32>) -> tensor<8x16xf32> { // CHECK: "tfl.softmax"(%arg0) {beta = 1.000000e+00 : f32} : (tensor<8x16xf32>) -> tensor<8x16xf32> } +func @softplus(%arg0: tensor<8x16xf32>) -> tensor<8x16xf32> { + %0 = "tf.Softplus"(%arg0) : (tensor<8x16xf32>) -> tensor<8x16xf32> + return %0 : tensor<8x16xf32> + +// CHECK-LABEL: softplus +// CHECK-NEXT: %[[cst:.*]] = constant dense<1.000000e+00> : tensor +// CHECK-NEXT: %[[exp:.*]] = "tfl.exp"(%arg0) : (tensor<8x16xf32>) -> tensor<8x16xf32> +// CHECK-NEXT: %[[add:.*]] = "tfl.add"(%[[exp]], %[[cst]]) {fused_activation_function = "NONE"} : (tensor<8x16xf32>, tensor) -> tensor<8x16xf32> +// CHECK-NEXT: %[[log:.*]] = "tfl.log"(%[[add]]) : (tensor<8x16xf32>) -> tensor<8x16xf32> +} + func @fakeQuantArgsFalse(%arg0: tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> { %0 = "tf.FakeQuantWithMinMaxArgs"(%arg0) {min = -0.1 : f32, max = 0.2 : f32, num_bits = 3, narrow_range = false} : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> return %0 : tensor<8x8x8x8xf32> @@ -1453,3 +1464,19 @@ func @LstmWithProjection(%arg: tensor<28x1x16xf32>) -> (tensor<28x1x8xf32>) { // CHECK: [[VAL_15:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_7]], [[VAL_8]], [[VAL_8]], [[VAL_8]], [[VAL_8]], [[VAL_9]], [[VAL_9]], [[VAL_9]], [[VAL_9]], [[VAL_14]], [[VAL_14]], [[VAL_14]], [[VAL_10]], [[VAL_10]], [[VAL_10]], [[VAL_10]], [[VAL_12]], [[VAL_14]], [[VAL_13]], [[VAL_11]], [[VAL_14]], [[VAL_14]], [[VAL_14]], [[VAL_14]]) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor<28x1x16xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16x8xf32>, tensor<16x8xf32>, tensor<16x8xf32>, tensor<16x8xf32>, none, none, none, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<8x16xf32>, none, tensor<1x8xf32>, tensor<1x16xf32>, none, none, none, none) -> tensor<28x1x8xf32> // CHECK: return [[VAL_15]] : tensor<28x1x8xf32> // CHECK: } + +func @UnidirectionalRnn(%arg: tensor<28x1x28xf32>) -> (tensor<28x1x28xf32>) { + %1 = "tf.Const"() {device = "", dtype = f32, value = dense<0.000000e+00>: tensor<28x28xf32>} : () -> tensor<28x28xf32> + %2 = "tf.Const"() {device = "", dtype = f32, value = dense<0.000000e+00>: tensor<28xf32>} : () -> tensor<28xf32> + %3 = "tf.Const"() {device = "", dtype = f32, value = dense<0.000000e+00>: tensor<1x28xf32>} : () 
-> tensor<1x28xf32> + %4:2 = "tf.UnidirectionalSequenceRnn"(%arg, %1, %1, %2, %3) {_tflite_input_indices = [0, 1, 2, 3, 4], device = ""} : (tensor<28x1x28xf32>, tensor<28x28xf32>, tensor<28x28xf32>, tensor<28xf32>, tensor<1x28xf32>) -> (tensor<*xf32>, tensor<28x1x28xf32>) + return %4#1 : tensor<28x1x28xf32> +} + +// CHECK: func @UnidirectionalRnn([[VAL_0:%.*]]: tensor<28x1x28xf32>) -> tensor<28x1x28xf32> { +// CHECK: [[VAL_1:%.*]] = constant dense<0.000000e+00> : tensor<28x28xf32> +// CHECK: [[VAL_2:%.*]] = constant dense<0.000000e+00> : tensor<28xf32> +// CHECK: [[VAL_3:%.*]] = constant dense<0.000000e+00> : tensor<1x28xf32> +// CHECK: [[VAL_4:%.*]] = "tfl.unidirectional_sequence_rnn"([[VAL_0]], [[VAL_1]], [[VAL_1]], [[VAL_2]], [[VAL_3]]) {fused_activation_function = "TANH", time_major = true} : (tensor<28x1x28xf32>, tensor<28x28xf32>, tensor<28x28xf32>, tensor<28xf32>, tensor<1x28xf32>) -> tensor<28x1x28xf32> +// CHECK: return [[VAL_4]] : tensor<28x1x28xf32> +// CHECK: } diff --git a/tensorflow/compiler/mlir/lite/tests/ops.mlir b/tensorflow/compiler/mlir/lite/tests/ops.mlir index a1369fe969a..da58b3704d0 100644 --- a/tensorflow/compiler/mlir/lite/tests/ops.mlir +++ b/tensorflow/compiler/mlir/lite/tests/ops.mlir @@ -878,6 +878,14 @@ func @pack(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2x2xi32> { // ----- +func @packUnranked(%arg0: tensor<2xi32>, %arg1: tensor<*xi32>) -> tensor<2x2xi32> { + // CHECK: "tfl.pack"(%arg0, %arg1) {axis = 0 : i32, values_count = 2 : i32} + %0 = "tfl.pack"(%arg0, %arg1) {axis = 0 : i32, values_count = 2 : i32} : (tensor<2xi32>, tensor<*xi32>) -> tensor<2x2xi32> + return %0 : tensor<2x2xi32> +} + +// ----- + func @packInputRank(%arg0: tensor<1x4xi32>, %arg1: tensor<1x4xi32>) -> tensor<1x4x2xi32> { // CHECK: "tfl.pack"(%arg0, %arg1) {axis = 2 : i32, values_count = 2 : i32} %0 = "tfl.pack"(%arg0, %arg1) {axis = 2 : i32, values_count = 2 : i32} : (tensor<1x4xi32>, tensor<1x4xi32>) -> tensor<1x4x2xi32> diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-composite-functions-tf.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-composite-functions-tf.mlir index 3b72a60f3c6..448a4f9eb5f 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-composite-functions-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-composite-functions-tf.mlir @@ -154,7 +154,7 @@ func @layernormalizedlstmcellsimple(%arg0: tensor<1x?xf32>, %arg1: tensor<3x4xf3 // ----- module { -func @inference_standard_lstm_7410(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor<8x40xf32>, %arg4: tensor<10x40xf32>, %arg5: tensor<40xf32>) -> (tensor, tensor, tensor, tensor, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.signature.is_stateful} { +func @inference_standard_lstm_time_major(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor<8x40xf32>, %arg4: tensor<10x40xf32>, %arg5: tensor<40xf32>) -> (tensor, tensor, tensor, tensor, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = 
"lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = true, tf.time_major = true} { %0 = "tf.BatchMatMulV2"(%arg0, %arg3) {adj_x = false, adj_y = false} : (tensor, tensor<8x40xf32>) -> tensor %1 = "tf.Add"(%0, %arg5) : (tensor, tensor<40xf32>) -> tensor %2 = "tf.BatchMatMulV2"(%1, %arg4) {adj_x = false, adj_y = true} : (tensor, tensor<10x40xf32>) -> tensor @@ -165,7 +165,7 @@ func @inference_standard_lstm_7410(%arg0: tensor, %arg1: tensor, tensor, tensor, tensor, tensor } -// CHECK: func @inference_standard_lstm_7410([[VAL_0:%.*]]: tensor, [[VAL_1:%.*]]: tensor, [[VAL_2:%.*]]: tensor, [[VAL_3:%.*]]: tensor<8x40xf32>, [[VAL_4:%.*]]: tensor<10x40xf32>, [[VAL_5:%.*]]: tensor<40xf32>) -> tensor attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.signature.is_stateful} { +// CHECK: func @inference_standard_lstm_time_major([[VAL_0:%.*]]: tensor, [[VAL_1:%.*]]: tensor, [[VAL_2:%.*]]: tensor, [[VAL_3:%.*]]: tensor<8x40xf32>, [[VAL_4:%.*]]: tensor<10x40xf32>, [[VAL_5:%.*]]: tensor<40xf32>) -> tensor<8x?x10xf32> attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = true, tf.time_major = true} { // CHECK: [[VAL_6:%.*]] = constant dense<[1, 0]> : tensor<2xi64> // CHECK: [[VAL_7:%.*]] = "tf.Transpose"([[VAL_3]], [[VAL_6]]) : (tensor<8x40xf32>, tensor<2xi64>) -> tensor<40x8xf32> // CHECK: [[VAL_8:%.*]] = constant dense<[1, 0]> : tensor<2xi64> @@ -181,7 +181,46 @@ func @inference_standard_lstm_7410(%arg0: tensor, %arg1: tensor, tensor<4xi32>, tensor) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) // CHECK: [[VAL_19:%.*]] = constant unit // CHECK: [[VAL_20:%.*]] = "tfl.lstm"([[VAL_0]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_15]]#0, [[VAL_15]]#1, [[VAL_15]]#2, [[VAL_15]]#3, [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_18]]#0, [[VAL_18]]#1, [[VAL_18]]#2, [[VAL_18]]#3, [[VAL_19]], [[VAL_19]], [[VAL_1]], [[VAL_2]], [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_19]]) ( { -// CHECK: }) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", kernel_type = "FULL", proj_clip = 0.000000e+00 : f32} : (tensor, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor, tensor, none, none, none, none) -> tensor -// CHECK: return [[VAL_21:%.*]] : tensor - +// CHECK: }) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", kernel_type = "FULL", proj_clip = 0.000000e+00 : f32} : (tensor, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor, tensor, none, none, none, none) -> tensor<8x?x10xf32> 
+// CHECK: return [[VAL_21:%.*]] : tensor<8x?x10xf32> +// CHECK: } +} + +// ----- + +module { +func @inference_standard_lstm_non_time_major(%arg0: tensor<8x8x8xf32>, %arg1: tensor, %arg2: tensor, %arg3: tensor<8x40xf32>, %arg4: tensor<10x40xf32>, %arg5: tensor<40xf32>) -> (tensor, tensor<8x8x10xf32>, tensor, tensor, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = true, tf.time_major = false} { + %0 = "tf.BatchMatMulV2"(%arg0, %arg3) {adj_x = false, adj_y = false} : (tensor<8x8x8xf32>, tensor<8x40xf32>) -> tensor<8x8x40xf32> + %1 = "tf.Add"(%0, %arg5) : (tensor<8x8x40xf32>, tensor<40xf32>) -> tensor<8x8x40xf32> + %2 = "tf.BatchMatMulV2"(%1, %arg4) {adj_x = false, adj_y = true} : (tensor<8x8x40xf32>, tensor<10x40xf32>) -> tensor<8x8x10xf32> + %3 = "tf.Add"(%2, %arg1) : (tensor<8x8x10xf32>, tensor) -> tensor<8x8x10xf32> + %4 = "tf.Add"(%2, %arg2) : (tensor<8x8x10xf32>, tensor) -> tensor<8x8x10xf32> + %5 = "tf.Add"(%arg1, %arg2) : (tensor, tensor) -> tensor + %6 = "tf.Const"() {_output_shapes = ["tfshape$"], device = "/device:CPU:0", dtype = f32, value = dense<1.000000e+00> : tensor} : () -> tensor + return %5, %4, %5, %5, %6 : tensor, tensor<8x8x10xf32>, tensor, tensor, tensor +} + +// CHECK: func @inference_standard_lstm_non_time_major([[VAL_0:%.*]]: tensor<8x8x8xf32>, [[VAL_1:%.*]]: tensor, [[VAL_2:%.*]]: tensor, [[VAL_3:%.*]]: tensor<8x40xf32>, [[VAL_4:%.*]]: tensor<10x40xf32>, [[VAL_5:%.*]]: tensor<40xf32>) -> tensor<8x8x10xf32> attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = true, tf.time_major = false} { +// CHECK: [[VAL_6:%.*]] = constant dense<[1, 0, 2]> : tensor<3xi64> +// CHECK: [[VAL_7:%.*]] = "tf.Transpose"([[VAL_0]], [[VAL_6]]) : (tensor<8x8x8xf32>, tensor<3xi64>) -> tensor<8x8x8xf32> +// CHECK: [[VAL_8:%.*]] = constant dense<[1, 0]> : tensor<2xi64> +// CHECK: [[VAL_9:%.*]] = "tf.Transpose"([[VAL_3]], [[VAL_8]]) : (tensor<8x40xf32>, tensor<2xi64>) -> tensor<40x8xf32> +// CHECK: [[VAL_10:%.*]] = constant dense<[1, 0]> : tensor<2xi64> +// CHECK: [[VAL_11:%.*]] = "tf.Transpose"([[VAL_4]], [[VAL_10]]) : (tensor<10x40xf32>, tensor<2xi64>) -> tensor<40x10xf32> +// CHECK: [[VAL_12:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> +// CHECK: [[VAL_13:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK: [[VAL_14:%.*]]:4 = "tf.SplitV"([[VAL_9]], [[VAL_12]], [[VAL_13]]) : (tensor<40x8xf32>, tensor<4xi32>, tensor) -> (tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>) +// CHECK: [[VAL_15:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> +// CHECK: [[VAL_16:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK: [[VAL_17:%.*]]:4 = "tf.SplitV"([[VAL_11]], [[VAL_15]], [[VAL_16]]) : (tensor<40x10xf32>, tensor<4xi32>, tensor) -> (tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, 
tensor<10x10xf32>) +// CHECK: [[VAL_18:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> +// CHECK: [[VAL_19:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK: [[VAL_20:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_18]], [[VAL_19]]) : (tensor<40xf32>, tensor<4xi32>, tensor) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) +// CHECK: [[VAL_21:%.*]] = constant unit +// CHECK: [[VAL_22:%.*]] = "tfl.lstm"([[VAL_0]], [[VAL_14]]#0, [[VAL_14]]#1, [[VAL_14]]#2, [[VAL_14]]#3, [[VAL_17]]#0, [[VAL_17]]#1, [[VAL_17]]#2, [[VAL_17]]#3, [[VAL_21]], [[VAL_21]], [[VAL_21]], [[VAL_20]]#0, [[VAL_20]]#1, [[VAL_20]]#2, [[VAL_20]]#3, [[VAL_21]], [[VAL_21]], [[VAL_1]], [[VAL_2]], [[VAL_21]], [[VAL_21]], [[VAL_21]], [[VAL_21]]) ( { +// CHECK: }) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", kernel_type = "FULL", proj_clip = 0.000000e+00 : f32} : (tensor<8x8x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor, tensor, none, none, none, none) -> tensor<8x8x10xf32> +// CHECK: [[VAL_23:%.*]] = constant dense<[1, 0, 2]> : tensor<3xi64> +// CHECK: [[VAL_24:%.*]] = "tf.Transpose"([[VAL_25:%.*]], [[VAL_23]]) : (tensor<8x8x10xf32>, tensor<3xi64>) -> tensor<8x8x10xf32> +// CHECK: return [[VAL_24]] : tensor<8x8x10xf32> +// CHECK: } } diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir index 68b8fe775bd..f937d0afd4d 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir @@ -622,3 +622,16 @@ func @QuantizeSharedBiases2( // CHECK: %{{.*}} = tfl.add %{{.*}}, %[[dq_0]] // CHECK: %{{.*}} = "tfl.conv_2d"(%{{.*}}, %{{.*}}, %[[dq]]) } + +// CHECK-LABEL: ReturnQuantizedResult +func @ReturnQuantizedResult(%arg0: tensor<1x224x224x3xf32>, %arg1: tensor<32x3x3x3xf32>, %arg2: tensor<32xf32>) -> (tensor<1x112x112x32xf32>, tensor<1x112x112x32xf32>) { + %0 = "tfl.depthwise_conv_2d"(%arg0, %arg1, %arg2) {depth_multiplier = 4 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 4 : i32, stride_w = 5 : i32} : (tensor<1x224x224x3xf32>, tensor<32x3x3x3xf32>, tensor<32xf32>) -> tensor<1x112x112x32xf32> + %1 = "tfl.quantize"(%0) {qtype = tensor<1x112x112x32x!quant.uniform>} : (tensor<1x112x112x32xf32>) -> tensor<1x112x112x32x!quant.uniform> + %2 = "tfl.dequantize"(%1) : (tensor<1x112x112x32x!quant.uniform>) -> (tensor<1x112x112x32xf32>) + return %0, %2 : tensor<1x112x112x32xf32>, tensor<1x112x112x32xf32> + +// CHECK: %[[dw:.*]] = "tfl.depthwise_conv_2d"(%arg0, %arg1, %arg2) +// CHECK: %[[q:.*]] = "tfl.quantize"(%[[dw]]) +// CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[q]]) +// CHECK: return %[[dq]], %[[dq]] +} diff --git a/tensorflow/compiler/mlir/lite/tests/tfl_while_outline.mlir b/tensorflow/compiler/mlir/lite/tests/tfl_while_outline.mlir index 4b1ba842552..3608d89e5e3 100644 --- a/tensorflow/compiler/mlir/lite/tests/tfl_while_outline.mlir +++ b/tensorflow/compiler/mlir/lite/tests/tfl_while_outline.mlir @@ -1,22 +1,28 @@ // Test to verify loop outlining. // RUN: tf-opt --split-input-file --tfl-while-loop-outline %s | FileCheck %s --dump-input-on-failure +// Check that while loop outlining is nop if re-ran. 
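+// (Outlining twice must produce identical IR: the pass's new already-outlined
+// check makes the second run a no-op, and diffing %t1/%t2 below verifies it.)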
+// RUN: tf-opt --tfl-while-loop-outline %s -o %t1 +// RUN: tf-opt --tfl-while-loop-outline %t1 -o %t2 +// RUN: diff %t1 %t2 // CHECK-LABEL: func @while func @while() -> tensor<1xf32> attributes {tf.entry_function = {outputs = "result"}} { %cst = constant dense<1> : tensor loc("dec") - %arg0 = constant dense<5> : tensor loc("N") - %arg1 = constant dense<3.0> : tensor<1xf32> loc("val") - %0:2 = "tfl.while"(%arg0, %arg1) ( { + %cst0 = constant dense<5> : tensor loc("N") + %cst1 = constant dense<3.0> : tensor<1xf32> loc("val") + %0:2 = "tfl.while"(%cst0, %cst1) ( { ^bb0(%arg2: tensor<*xi32>, %arg3: tensor<*xf32>): // CHECK: call @WhileOp_cond + // CHECK-SAME: (tensor<*xi32>, tensor<*xf32>, tensor) %cst_0 = constant dense<0> : tensor %1 = "tfl.greater"(%arg2, %cst_0) : (tensor<*xi32>, tensor) -> tensor "tfl.yield"(%1) : (tensor) -> () }, { ^bb0(%arg2: tensor<*xi32>, %arg3: tensor<*xf32>): // CHECK: call @WhileOp_body + // CHECK-SAME: (tensor<*xi32>, tensor<*xf32>, tensor) %1 = "tfl.sub"(%arg2, %cst) {fused_activation_function = "NONE"} : (tensor<*xi32>, tensor) -> tensor<*xi32> %2 = tfl.add %arg3, %arg3 {fused_activation_function = "NONE"} : tensor<*xf32> @@ -32,6 +38,52 @@ func @while() -> tensor<1xf32> // ----- +// CHECK-LABEL: func @while2 +// Verify that while body//cond with implicitly captured values result in changing while operands/results. +func @while2() -> tensor<1xf32> attributes {tf.entry_function = {outputs = "result"}} { + %cst = constant dense<1> : tensor + %cst_0 = constant dense<5> : tensor + %cst_1 = constant dense<3.000000e+00> : tensor<1xf32> + // Verifies 3 operands post outlining. + // CHECK: "tfl.while"({{.*}}, {{.*}}, {{.*}}) ( + %0:2 = "tfl.while"(%cst_0, %cst_1) ( { + ^bb0(%arg0: tensor<*xi32>, %arg1: tensor<*xf32>): // no predecessors + // CHECK: call @WhileOp_cond + // CHECK-SAME: (tensor<*xi32>, tensor<*xf32>, tensor) + %1 = call @WhileOp_cond(%arg0, %arg1, %cst) : (tensor<*xi32>, tensor<*xf32>, tensor) -> tensor + "tfl.yield"(%1) : (tensor) -> () + }, { + ^bb0(%arg0: tensor<*xi32>, %arg1: tensor<*xf32>): // no predecessors + // CHECK: call @WhileOp_body + // CHECK-SAME: (tensor<*xi32>, tensor<*xf32>, tensor) + %1:3 = call @WhileOp_body(%arg0, %arg1, %cst) : (tensor<*xi32>, tensor<*xf32>, tensor) -> (tensor<*xi32>, tensor<*xf32>, tensor) + "tfl.yield"(%1#0, %1#1) : (tensor<*xi32>, tensor<*xf32>) -> () + }) : (tensor, tensor<1xf32>) -> (tensor, tensor<1xf32>) loc("WhileOp") + // CHECK: (tensor, tensor<1xf32>, tensor) -> + // CHECK-SAME: (tensor, tensor<1xf32>, tensor) + return %0#1 : tensor<1xf32> +} + +func @WhileOp_cond(%arg0: tensor<*xi32>, %arg1: tensor<*xf32>, %arg2: tensor) -> tensor attributes {sym_visibility = "private"} { + %cst = constant dense<0> : tensor + %0 = "tfl.greater"(%arg0, %cst) : (tensor<*xi32>, tensor) -> tensor + return %0 : tensor +} + +func @WhileOp_body(%arg0: tensor<*xi32>, %arg1: tensor<*xf32>, %arg2: tensor) -> (tensor<*xi32>, tensor<*xf32>, tensor) attributes {sym_visibility = "private"} { + %0 = "tfl.sub"(%arg0, %arg2) {fused_activation_function = "NONE"} : (tensor<*xi32>, tensor) -> tensor<*xi32> + %1 = tfl.add %arg1, %arg1 {fused_activation_function = "NONE"} : tensor<*xf32> + return %0, %1, %arg2 : tensor<*xi32>, tensor<*xf32>, tensor +} + +// CHECK-LABEL: func @WhileOp_cond( +// CHECK: tfl.greater +// CHECK-LABEL: func @WhileOp_body( +// CHECK: tfl.sub +// CHECK: tfl.add + +// ----- + func @rnn(%arg0: tensor<4x4x3xf32> {tf.device = "/device:CPU:0"}) -> tensor<4x?x2xf32> attributes {tf.entry_function = {inputs = "Placeholder", 
outputs = "rnn/transpose_1"}} { %cst = constant dense<0.000000e+00> : tensor<4x2xf32> %cst_0 = constant dense<0.000000e+00> : tensor<8xf32> diff --git a/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc b/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc index 0472bd6abcf..30fe391762f 100644 --- a/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc +++ b/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "mlir/Dialect/StandardOps/Ops.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/Attributes.h" diff --git a/tensorflow/compiler/mlir/lite/transforms/dilated_conv.h b/tensorflow/compiler/mlir/lite/transforms/dilated_conv.h index c3d3df14e0b..65bed845bae 100644 --- a/tensorflow/compiler/mlir/lite/transforms/dilated_conv.h +++ b/tensorflow/compiler/mlir/lite/transforms/dilated_conv.h @@ -27,6 +27,7 @@ limitations under the License. #include "mlir/IR/StandardTypes.h" // TF:llvm-project #include "mlir/IR/TypeUtilities.h" // TF:llvm-project #include "mlir/Pass/Pass.h" // TF:llvm-project +#include "tensorflow/compiler/mlir/lite/utils/validators.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" namespace mlir { @@ -80,6 +81,17 @@ class ConvertTFDilatedConvOp : public OpRewritePattern { template PatternMatchResult ConvertTFDilatedConvOp::matchAndRewrite( Conv2dOpTy op, PatternRewriter& rewriter) const { + // Make sure Conv2D has 'VALID' padding. + if (op.template getAttrOfType("padding").getValue() != "VALID") { + return Pattern::matchFailure(); + } + // Make sure dilations are all ones if set. + const ArrayAttr& dilations = + op.template getAttrOfType("dilations"); + if (dilations && !TFIntListIsAllOnes(dilations)) { + return Pattern::matchFailure(); + } + // Check if the ConvOp is preceded by a `Expand` op and succeeded by a // `Squeeze` op. Operation* prev_op = op.getOperation()->getPrevNode(); @@ -90,6 +102,7 @@ PatternMatchResult ConvertTFDilatedConvOp::matchAndRewrite( TF::ExpandDimsOp expand_op; TF::SqueezeOp squeeze_op; + int64_t expand_axis; // Expand + Squeeze op. if (llvm::isa(prev_op)) { if (!llvm::isa(next_op)) { @@ -99,6 +112,22 @@ PatternMatchResult ConvertTFDilatedConvOp::matchAndRewrite( expand_op = llvm::cast(prev_op); squeeze_op = llvm::cast(next_op); + // Make sure that the axis in `expand_op` is constant. + if (auto const_op = + llvm::dyn_cast(expand_op.dim().getDefiningOp())) { + expand_axis = + (*const_op.value().cast().getIntValues().begin()) + .getSExtValue(); + } else { + return Pattern::matchFailure(); + } + // Make sure that the `squeeze_dims` is equal to `expand_axis`. + auto squeeze_dims = squeeze_op.squeeze_dims(); + if (squeeze_dims.size() != 1 || + squeeze_dims[0].cast().getInt() != expand_axis) { + return Pattern::matchFailure(); + } + // Update previous/next op pointer. prev_op = prev_op->getPrevNode(); if (!prev_op) return Pattern::matchFailure(); @@ -108,10 +137,14 @@ PatternMatchResult ConvertTFDilatedConvOp::matchAndRewrite( // SpaceToBatchND op. if (!llvm::isa(prev_op)) return Pattern::matchFailure(); + // TODO(b/149936532): Check `padding` input, currently ignored. TF::SpaceToBatchNDOp stb_op = llvm::cast(prev_op); // Pad op. TF::PadOp pad_op; + // TODO(b/149936532): Currently we just ignore the PadOp. 
However note that + // in real scenarios this may not always be correct: user can put a PadOp here + // with non-trivial consequences. if (llvm::isa(next_op)) { pad_op = llvm::cast(next_op); next_op = next_op->getNextNode(); @@ -119,6 +152,7 @@ PatternMatchResult ConvertTFDilatedConvOp::matchAndRewrite( } // BatchToSpaceND + BiasAdd. + // TODO(b/149936532): Check the `crops` input, currently ignored. TF::BatchToSpaceNDOp bts_op; TF::BiasAddOp biasadd_op; bool final_op_is_bts = true; @@ -146,14 +180,10 @@ PatternMatchResult ConvertTFDilatedConvOp::matchAndRewrite( if (!dilations_attr.hasValue()) return Pattern::matchFailure(); op.setAttr("dilations", dilations_attr.getValue()); - // Here we need to set the correct padding for Conv op. In TF, the conv op - // inserted after 'SpaceToBatch' always has 'VALID' padding. This might - // become a problem here if the original Conv op has 'SAME' padding. When - // the original conv has 'SAME' padding, TF will set a non-zero padding for - // the 'SpaceToBatch' op, so we rely on this information to check if we need - // to change the padding from 'VALID' to 'SAME' (a.k.a when we see non-zero - // values in `stb_op.paddings`, we change the current Conv's padding to - // 'SAME'). + // Padding is set to 'SAME' when `stb_op` has non-zero paddings. + // TODO(b/149936532): This assumption only holds when the input width & height + // is multiple of dilation width & height. We should fix it in order to + // support other use cases. auto stb_paddings = stb_op.paddings(); ElementsAttr stb_paddings_attr; if (matchPattern(stb_paddings, m_Constant(&stb_paddings_attr))) { @@ -175,7 +205,8 @@ PatternMatchResult ConvertTFDilatedConvOp::matchAndRewrite( auto input_shape = stb_op.input().getType().cast().getShape(); SmallVector expand_shape(input_shape.begin(), input_shape.end()); - expand_shape.push_back(1); + expand_shape.insert(expand_shape.begin() + expand_axis, 1); + auto expand_result_type = RankedTensorType::get( expand_shape, getElementTypeOrSelf(stb_op.input())); expand_op.getResult().setType(expand_result_type); @@ -208,7 +239,7 @@ ConvertTFDilatedConvOp::ExtractDilationsAttrFromBlockShape( ElementsAttr stb_bs_attr, bts_bs_attr; if (!matchPattern(stb_block_shape, m_Constant(&stb_bs_attr)) || !matchPattern(bts_block_shape, m_Constant(&bts_bs_attr))) { - // Returns failure status if block shape is not a constant. + // Returns failure status if block_shape is not a constant. return {}; } // Check that the block_shape of `stb_op` and `bts_op` are equal. @@ -217,9 +248,8 @@ ConvertTFDilatedConvOp::ExtractDilationsAttrFromBlockShape( if (stb_bs_attr.getValue({i}) != bts_bs_attr.getValue({i})) return {}; } - // TODO(haoliang): support 1-D dilated conv. + // Set dilation factor. if (stb_bs_attr.getNumElements() < 2) return {}; - int dilation_h_factor = stb_bs_attr.getValue({0}).cast().getInt(); int dilation_w_factor = diff --git a/tensorflow/compiler/mlir/lite/transforms/extract_ophint.cc b/tensorflow/compiler/mlir/lite/transforms/extract_ophint.cc index e07cea8535e..3582046f13f 100644 --- a/tensorflow/compiler/mlir/lite/transforms/extract_ophint.cc +++ b/tensorflow/compiler/mlir/lite/transforms/extract_ophint.cc @@ -22,7 +22,7 @@ limitations under the License. 
#include "llvm/ADT/StringSwitch.h" #include "llvm/Support/Casting.h" #include "mlir/Analysis/LoopAnalysis.h" // TF:llvm-project -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Block.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_ophint_func_op.cc b/tensorflow/compiler/mlir/lite/transforms/legalize_ophint_func_op.cc index e31b143ab43..f3a15b7ebd3 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_ophint_func_op.cc +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_ophint_func_op.cc @@ -15,7 +15,7 @@ limitations under the License. #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/StringMap.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Block.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td index 7bc08ee1c76..683905d06c7 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td @@ -16,7 +16,7 @@ limitations under the License. // TFLite legalization patterns include "mlir/IR/OpBase.td" -include "mlir/Dialect/StandardOps/Ops.td" +include "mlir/Dialect/StandardOps/IR/Ops.td" include "tensorflow/compiler/mlir/lite/ir/tfl_ops.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" @@ -167,6 +167,7 @@ def : Pat<(TF_SigmoidOp $arg), (TFL_LogisticOp $arg)>; def : Pat<(TF_SinOp F32Tensor:$arg), (TFL_SinOp $arg)>; def : Pat<(TF_SliceOp $input, $begin, $size), (TFL_SliceOp $input, $begin, $size)>; def : Pat<(TF_SoftmaxOp $arg), (TFL_SoftmaxOp $arg, ConstF32Attr<"1.0">)>; +def : Pat<(TF_SoftplusOp F32Tensor:$arg0), (TFL_LogOp (TFL_AddOp (TFL_ExpOp $arg0), (ConstantOp ConstantAttr, "1.0f">), TFL_AF_None))>; def : Pat<(TF_SqueezeOp $arg, $squeeze_dims), (TFL_SqueezeOp $arg, $squeeze_dims)>; def : Pat<(TF_TanhOp $arg), (TFL_TanhOp $arg)>; def : Pat<(TF_TransposeOp $arg, $perm), (TFL_TransposeOp $arg, $perm)>; @@ -340,7 +341,7 @@ def : Pat<(TF_MatrixDiagOp $diagonal), (TFL_MatrixDiagOp $diagonal)>; class I32VectorElementsAttr : ElementsAttrBase< CPred<"$_self.isa() &&" "$_self.cast().getType()." - "getElementType().isInteger(32)">, + "getElementType().isSignlessInteger(32)">, "32-bit int elements attribute of shape [" # len # "]"> { let storageType = [{ DenseIntElementsAttr }]; diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc index 7501832099a..cf24ed7e0f4 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc @@ -31,6 +31,7 @@ limitations under the License. 
#include "mlir/Dialect/QuantOps/FakeQuantSupport.h" // TF:llvm-project #include "mlir/Dialect/QuantOps/UniformSupport.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project +#include "mlir/IR/MLIRContext.h" // TF:llvm-project #include "mlir/IR/Operation.h" // TF:llvm-project #include "mlir/IR/PatternMatch.h" // TF:llvm-project #include "mlir/IR/StandardTypes.h" // TF:llvm-project @@ -64,6 +65,7 @@ using xla::Status; using xla::StatusOr; constexpr char kUnidirectionalSequenceLstm[] = "tf.UnidirectionalSequenceLstm"; +constexpr char kUnidirectionalSequenceRnn[] = "tf.UnidirectionalSequenceRnn"; constexpr char kTfLiteInputIndices[] = "_tflite_input_indices"; // Legalize operations in functions. @@ -253,7 +255,7 @@ PatternMatchResult ConvertTFReshapeOp::matchAndRewrite( ShapedType shape_type = shape.getType().cast(); // The tfl reshape's #2 operand needs to i32 tensor type, so we have to cast. - if (!shape_type.getElementType().isInteger(32)) { + if (!shape_type.getElementType().isSignlessInteger(32)) { auto new_shape = shape_type.getShape(); IntegerType new_ele_type = rewriter.getIntegerType(32); ShapedType new_type = RankedTensorType::get(new_shape, new_ele_type); @@ -632,6 +634,66 @@ struct LegalizeUnidirectionalSequenceLstm : public RewritePattern { } }; +// Legalize unidirectional seqeucen rnn. +struct LegalizeUnidirectionalSequenceRnn : public RewritePattern { + explicit LegalizeUnidirectionalSequenceRnn(MLIRContext* context) + : RewritePattern(kUnidirectionalSequenceRnn, 1, context) {} + + PatternMatchResult matchAndRewrite(Operation* op, + PatternRewriter& rewriter) const override { + auto tflite_indices_attr = + op->getAttrOfType(kTfLiteInputIndices); + if (!tflite_indices_attr) return matchFailure(); + + if (op->getNumOperands() != 5) { + op->emitError() + << "We're expecting 5 inputs for UnidirectionalSequenceRNN, only " + << op->getNumOperands() << " provided"; + return matchFailure(); + } + + if (op->getNumResults() != 2) { + op->emitError() + << "We're expecting 2 inputs for UnidirectionalSequenceRNN, only " + << op->getNumResults() << " found"; + return matchFailure(); + } + + // Populate inputs. + // UnidirectionalSequenceRnn is expected to have 5 inputs, and none of them + // are optional inputs. + SmallVector inputs; + for (int i = 0; i < 5; ++i) { + inputs.push_back(op->getOperand(i)); + } + + // Populate outputs. + // UnidirectionalSequenceRnn should only have 1 output, and that is the + // original ophint converted node's 2nd output. + SmallVector result_types; + result_types.push_back(op->getOpResult(1).getType()); + + // Populate attributes. + SmallVector attributes; + // Activation will always be tanh. + attributes.push_back(rewriter.getNamedAttr("fused_activation_function", + rewriter.getStringAttr("TANH"))); + + // will always be time_majored. + attributes.push_back( + rewriter.getNamedAttr("time_major", rewriter.getBoolAttr(true))); + + auto rnn_op = rewriter.create( + op->getLoc(), result_types, inputs, attributes); + + // Rewire the output. + op->getResult(1).replaceAllUsesWith(rnn_op.getResult()); + op->erase(); + + return matchSuccess(); + } +}; + void LegalizeTF::runOnFunction() { OwningRewritePatternList patterns; auto* ctx = &getContext(); @@ -647,7 +709,8 @@ void LegalizeTF::runOnFunction() { ConvertTFReciprocalOp, ConvertTFRandomUniformOp>(ctx); // Ophint python converter converted tf node pattern. 
-  patterns.insert<LegalizeUnidirectionalSequenceLstm>(ctx);
+  patterns.insert<LegalizeUnidirectionalSequenceLstm,
+                  LegalizeUnidirectionalSequenceRnn>(ctx);
   applyPatternsGreedily(func, patterns);
 }
diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_tf_while.cc b/tensorflow/compiler/mlir/lite/transforms/legalize_tf_while.cc
index 7d1dbbb9fcc..ea44a34eb2b 100644
--- a/tensorflow/compiler/mlir/lite/transforms/legalize_tf_while.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/legalize_tf_while.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 // Converts TF While to TFL While with single call in body and cond.
 
-#include "mlir/Dialect/StandardOps/Ops.h"  // TF:llvm-project
+#include "mlir/Dialect/StandardOps/IR/Ops.h"  // TF:llvm-project
 #include "mlir/IR/Attributes.h"  // TF:llvm-project
 #include "mlir/IR/Builders.h"  // TF:llvm-project
 #include "mlir/IR/Operation.h"  // TF:llvm-project
diff --git a/tensorflow/compiler/mlir/lite/transforms/load_quantization_recipe.cc b/tensorflow/compiler/mlir/lite/transforms/load_quantization_recipe.cc
index 3349261af02..4fde08bc1cf 100644
--- a/tensorflow/compiler/mlir/lite/transforms/load_quantization_recipe.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/load_quantization_recipe.cc
@@ -20,7 +20,7 @@ limitations under the License.
 #include "llvm/ADT/None.h"
 #include "llvm/ADT/Optional.h"
 #include "mlir/Dialect/QuantOps/QuantTypes.h"  // TF:llvm-project
-#include "mlir/Dialect/StandardOps/Ops.h"  // TF:llvm-project
+#include "mlir/Dialect/StandardOps/IR/Ops.h"  // TF:llvm-project
 #include "mlir/IR/Builders.h"  // TF:llvm-project
 #include "mlir/IR/MLIRContext.h"  // TF:llvm-project
 #include "mlir/Pass/Pass.h"  // TF:llvm-project
diff --git a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc
index 1b240e2e674..00159644185 100644
--- a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc
@@ -32,7 +32,7 @@ limitations under the License.
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Debug.h"
 #include "mlir/Analysis/LoopAnalysis.h"  // TF:llvm-project
-#include "mlir/Dialect/StandardOps/Ops.h"  // TF:llvm-project
+#include "mlir/Dialect/StandardOps/IR/Ops.h"  // TF:llvm-project
 #include "mlir/IR/Attributes.h"  // TF:llvm-project
 #include "mlir/IR/Block.h"  // TF:llvm-project
 #include "mlir/IR/Function.h"  // TF:llvm-project
@@ -335,8 +335,9 @@ struct ConvertTensorListInitOp : public OpConversionPattern {
                   ConversionPatternRewriter &rewriter) const override {
     Type dtype = op.element_dtype();
     if (!(dtype.isF16() || dtype.isF32() || dtype.isF64() ||
-          dtype.isInteger(1) || dtype.isInteger(8) || dtype.isInteger(16) ||
-          dtype.isInteger(32) || dtype.isInteger(64))) {
+          dtype.isInteger(1) || dtype.isSignlessInteger(8) ||
+          dtype.isSignlessInteger(16) || dtype.isSignlessInteger(32) ||
+          dtype.isSignlessInteger(64))) {
       op.emitError(
           "requires element_dtype to be 1-bit/8-bit/16-bit/32-bit/64-bit "
          "integer or 16-bit/32-bit/64-bit float type during TF Lite "
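A recurring change throughout this patch (the tensor-list check above, legalize_tf.cc, attribute_utils.cc, the .td predicates) swaps isInteger(n) for isSignlessInteger(n): MLIR now distinguishes the signless i32 from the signed/unsigned si32/ui32 variants, and the TFLite lowerings only accept the signless form. A minimal sketch of the distinction — a hypothetical helper, assuming the MLIR headers of this vintage:

#include "mlir/IR/StandardTypes.h"  // TF:llvm-project

// Accepts i32 but rejects si32/ui32 (and any non-integer type). A bare
// isInteger(32) matches any 32-bit integer regardless of signedness, which
// is why the passes in this patch migrate to the signless check.
static bool IsTfliteCompatibleI32(mlir::Type type) {
  return type.isSignlessInteger(32);
}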
#include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/Casting.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Matchers.h" // TF:llvm-project #include "mlir/IR/PatternMatch.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td index 71017fe2801..0ad5be055dc 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td @@ -16,7 +16,7 @@ limitations under the License. // This is the optimization pattern definition file for TensorFlow Lite. include "mlir/IR/OpBase.td" -include "mlir/Dialect/StandardOps/Ops.td" +include "mlir/Dialect/StandardOps/IR/Ops.td" include "tensorflow/compiler/mlir/lite/ir/tfl_ops.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" diff --git a/tensorflow/compiler/mlir/lite/transforms/post_quantize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/post_quantize_patterns.td index 283b29ea005..ecceba5316e 100644 --- a/tensorflow/compiler/mlir/lite/transforms/post_quantize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/post_quantize_patterns.td @@ -16,7 +16,7 @@ limitations under the License. // This is the quantization pattern definition file for TensorFlow Lite. include "mlir/IR/OpBase.td" -include "mlir/Dialect/StandardOps/Ops.td" +include "mlir/Dialect/StandardOps/IR/Ops.td" include "tensorflow/compiler/mlir/lite/ir/tfl_ops.td" // Both Quantize and Dequantize ops have side effects, so we have to define diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc index 7181877085d..98f9c73f791 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc @@ -23,7 +23,7 @@ limitations under the License. #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project #include "mlir/IR/Function.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc index 02fcbd7e11c..316a9d2cf2a 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc @@ -19,6 +19,7 @@ limitations under the License. #include "absl/memory/memory.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/CommandLine.h" #include "mlir/Dialect/QuantOps/QuantOps.h" // TF:llvm-project @@ -115,6 +116,10 @@ class PrepareQuantizePass : public FunctionPass { } } + // Apply some sanity check and report some warnings for those don't follow + // the best quantization practise. This also fixes some simple violations. 
+ void SanityCheckAndAdjustment(FuncOp func); + QuantizationSpecs quant_specs_; }; @@ -184,13 +189,56 @@ bool PrepareQuantizePass::RemoveRedundantStats(FuncOp func) { return RemoveRedundantStatsOps(func, GetOpQuantSpec); } +static Value Quantized(Operation* user) { + if (auto q = llvm::dyn_cast_or_null(user)) { + if (auto dq = llvm::dyn_cast_or_null( + *q.getResult().user_begin())) { + return dq.getResult(); + } + } + return {}; +} + +void PrepareQuantizePass::SanityCheckAndAdjustment(FuncOp func) { + // If an op output has two users: one of them is a quantize op and another + // one is returned directly, we decide to return the quantized result instead, + // so this op can be quantized. This is only applied on the returned result + // because the error will not be accumulated. + func.walk([&](ReturnOp ret) { + int i = 0; + for (Value returned : ret.operands()) { + llvm::SmallVector quantized; + for (auto user : returned.getUsers()) { + if (auto q = Quantized(user)) { + quantized.push_back(q); + } + } + if (quantized.size() == 1) { + ret.setOperand(i, quantized.front()); + } + i++; + } + }); + + // We prefer to placing quantization emulation ops on the results of the + // concat ops. + func.walk([&](ConcatenationOp concat) { + if (concat.output().hasOneUse() && + Quantized(*concat.output().user_begin())) { + return; + } + concat.emitWarning( + "Missing quantization parameter on the output might introduce " + "quantization error!"); + }); +} + using PrepareQuantStats = quant::ConvertStatsToQDQs; void PrepareQuantizePass::runOnFunction() { FuncOp func = getFunction(); MLIRContext* ctx = func.getContext(); - ConvertTFLQuantOpsToMlirQuantOps(func); if (quant_specs_.post_training_quantization) { @@ -220,6 +268,8 @@ void PrepareQuantizePass::runOnFunction() { } applyPatternsGreedily(func, patterns); + SanityCheckAndAdjustment(func); + // Finally, the quantization parameters can be propagated to the rest of the // values (tensors). ApplyQuantizationParamsPropagation(func, is_signed, disable_per_channel, diff --git a/tensorflow/compiler/mlir/lite/transforms/quantize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/quantize_patterns.td index 07dd8ab4455..22bcc563f7b 100644 --- a/tensorflow/compiler/mlir/lite/transforms/quantize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/quantize_patterns.td @@ -16,7 +16,7 @@ limitations under the License. // This is the quantization pattern definition file for TensorFlow Lite. include "mlir/IR/OpBase.td" -include "mlir/Dialect/StandardOps/Ops.td" +include "mlir/Dialect/StandardOps/IR/Ops.td" include "tensorflow/compiler/mlir/lite/ir/tfl_ops.td" // Quantize attribute $0 by using quantization parameter from %1. diff --git a/tensorflow/compiler/mlir/lite/transforms/split_merged_operands.cc b/tensorflow/compiler/mlir/lite/transforms/split_merged_operands.cc index 17125bffd85..c8aa67084ce 100644 --- a/tensorflow/compiler/mlir/lite/transforms/split_merged_operands.cc +++ b/tensorflow/compiler/mlir/lite/transforms/split_merged_operands.cc @@ -18,7 +18,7 @@ limitations under the License. 
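In terms of the new ReturnQuantizedResult test earlier in this patch, the effect of SanityCheckAndAdjustment is just a terminator-operand swap; sketched below in IR comments (for orientation only, not runnable):

// %0 = "tfl.depthwise_conv_2d"(%arg0, %arg1, %arg2) ...  // float result
// %1 = "tfl.quantize"(%0) ...
// %2 = "tfl.dequantize"(%1) ...
// return %0, %2   // before: %0 escapes without quantization parameters
// return %2, %2   // after: every returned use goes through the q/dq pair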
#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/StringMap.h" #include "llvm/Support/Casting.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Block.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/lite/transforms/tensorlist_patterns.td b/tensorflow/compiler/mlir/lite/transforms/tensorlist_patterns.td index ff024ad0463..b0435b7cf4c 100644 --- a/tensorflow/compiler/mlir/lite/transforms/tensorlist_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/tensorlist_patterns.td @@ -14,7 +14,7 @@ limitations under the License. ==============================================================================*/ include "mlir/IR/OpBase.td" -include "mlir/Dialect/StandardOps/Ops.td" +include "mlir/Dialect/StandardOps/IR/Ops.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/lite/transforms/trim_functions_tf.cc b/tensorflow/compiler/mlir/lite/transforms/trim_functions_tf.cc index 5a7397ed9c9..13afa1bf9b8 100644 --- a/tensorflow/compiler/mlir/lite/transforms/trim_functions_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/trim_functions_tf.cc @@ -20,7 +20,7 @@ limitations under the License. #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/Support/CommandLine.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project #include "mlir/IR/Identifier.h" // TF:llvm-project #include "mlir/IR/Location.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/lite/transforms/while_loop_outline.cc b/tensorflow/compiler/mlir/lite/transforms/while_loop_outline.cc index 45d1b21ceb3..8ed5b0e0341 100644 --- a/tensorflow/compiler/mlir/lite/transforms/while_loop_outline.cc +++ b/tensorflow/compiler/mlir/lite/transforms/while_loop_outline.cc @@ -17,7 +17,7 @@ limitations under the License. #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/Support/CommandLine.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project #include "mlir/IR/Identifier.h" // TF:llvm-project #include "mlir/IR/Location.h" // TF:llvm-project @@ -52,25 +52,47 @@ class WhileOutlinePass : public mlir::ModulePass { tensorflow::OpOrArgLocNameMapper mapper_; }; +} // namespace std::string WhileOutlinePass::GetName(Operation* op, StringRef suffix) { return (mapper_.GetUniqueName(op) + suffix).str(); } +// Returns whether the WhileOp is already outlined (e.g., only consists of calls +// to functions). +static bool IsAlreadyOutlinedd(WhileOp while_op) { + auto just_call = [](Region& region) { + auto it = region.front().begin(); + if (!isa(*it)) return false; + ++it; + if (!isa(*it)) return false; + return true; + }; + return just_call(while_op.body()) && just_call(while_op.cond()); +} + void WhileOutlinePass::OutlineWhile(WhileOp while_op) { OpBuilder builder(&getContext()); - // Colect external values used. Note: if an external value is also passed in - // via argument, then it could end up being passed in multiple times. In the - // case where the value was already just passed through, this will result in - // redundancy. 
+  // Collect external values used.
  llvm::SetVector<Value> extern_values;
-  // Sink down none type constants into the functions.
+  // The basic block arguments correspond to values that are loop carried,
+  // while all operands past them are loop independent. Initialize
+  // extern_values with the while_op operands that are not loop carried.
+  auto num_loop_carried = while_op.cond().front().getNumArguments();
+  auto not_carried_operands =
+      while_op.getOperands().drop_front(num_loop_carried);
+  extern_values.insert(not_carried_operands.begin(),
+                       not_carried_operands.end());
+  auto old_extern_values_size = extern_values.size();
+
  llvm::SmallVector<Region*, 2> regions{&while_op.cond(), &while_op.body()};
  for (auto it : llvm::enumerate(regions)) {
    llvm::SetVector<Value> region_extern_values;
    Value const_none = nullptr;
    getUsedValuesDefinedAbove(*it.value(), region_extern_values);
+
+    // Sink down none type constants into the functions.
    for (auto extern_value : region_extern_values) {
      if (!extern_value.getType().isa<NoneType>()) {
        extern_values.insert(extern_value);
@@ -89,12 +111,23 @@ void WhileOutlinePass::OutlineWhile(WhileOp while_op) {
    }
  }

-  // Colect new types.
+  bool has_extra_extern_values = old_extern_values_size != extern_values.size();
+  // If an extern value is already an operand after the loop-carried operands,
+  // then it need not be passed in again.
+  // Compute all the extra operands that have to be added to the while.
+  llvm::SetVector<Value> extra_operands;
+  if (has_extra_extern_values) {
+    auto new_extern =
+        extern_values.getArrayRef().drop_front(old_extern_values_size);
+    extra_operands.insert(new_extern.begin(), new_extern.end());
+  }
+
+  // Skip if already just calls.
+  if (extra_operands.empty() && IsAlreadyOutlined(while_op)) return;
+
+  // Collect new types.
  SmallVector<Type, 4> types;
-  types.reserve(extern_values.size() +
-                while_op.cond().front().getNumArguments());
-  // Type of block arguments are used as these could differ from those of While
-  // op, but has to match between cond and body.
+  types.reserve(extra_operands.size() + while_op.getNumOperands());
  for (BlockArgument ba : while_op.cond().front().getArguments())
    types.push_back(ba.getType());
  for (Value operand : extern_values) types.push_back(operand.getType());
@@ -119,7 +152,7 @@ void WhileOutlinePass::OutlineWhile(WhileOp while_op) {
    outlined_func.getBody().takeBody(region);
    Region& func_region = outlined_func.getBody();

-    // Replace all external uses with block args and update uses..
+    // Replace all external uses with block args and update uses.
    llvm::SmallVector<Value, 4> new_args;
    new_args.reserve(extern_values.size());
    Block& block = func_region.front();
@@ -133,10 +166,12 @@ void WhileOutlinePass::OutlineWhile(WhileOp while_op) {
    Operation* yield_op = outlined_func.getBody().front().getTerminator();
    OpBuilder b(yield_op);
    llvm::SmallVector<Value, 4> args;
-    args.reserve(yield_op->getNumOperands() + new_args.size());
+    auto loop_carried_yield_operands =
+        yield_op->getOperands().take_front(num_loop_carried);
+    args.reserve(loop_carried_yield_operands.size() + new_args.size());
    if (passthru_extra_args) {
      // Add operands of yield to the return, inserting casts if needed.
-      for (auto it : llvm::zip(yield_op->getOperands(), types)) {
+      for (auto it : llvm::zip_first(loop_carried_yield_operands, types)) {
        auto value = std::get<0>(it);
        auto type = std::get<1>(it);
        if (value.getType() == type) {
@@ -160,11 +195,6 @@ void WhileOutlinePass::OutlineWhile(WhileOp while_op) {

  // Replace region with call to outline function.
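+  // For illustration only (the outlined function names are hypothetical):
+  // after the rewrite below, each cond/body region reduces to a single call
+  // followed by a yield,
+  //
+  //   %0 = call @WhileOp_cond(...) : ...
+  //   "tfl.yield"(%0) : ...
+  //
+  // with the original region ops moved into standalone functions.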
  auto replace_with_call = [&](StringRef name, Region& region,
                               bool passthru_extra_args) {
-    // Skip if already only a call.
-    if (region.front().getOperations().size() == 2 &&
-        isa<CallOp>(region.front().front()))
-      return;
-
    auto func = create_outline_func(name, region, passthru_extra_args);
    OpBuilder b(region);
    // The body of the region is empty/has been outlined into the function.
@@ -185,19 +215,19 @@ void WhileOutlinePass::OutlineWhile(WhileOp while_op) {

  // If there are extern values used then the result type of the while has to
  // change, so replace with new while op.
-  if (extern_values.empty()) return;
+  if (extra_operands.empty()) return;

  Operation* op = while_op.getOperation();
  SmallVector<Value, 4> operands;
  SmallVector<Type, 4> new_types;
-  operands.reserve(op->getNumOperands() + extern_values.size());
+  operands.reserve(types.size());
  new_types.reserve(operands.size());
  auto add_operand = [&](Value v) {
    operands.push_back(v);
    new_types.push_back(v.getType());
  };
  for (auto operand : op->getOperands()) add_operand(operand);
-  for (auto operand : extern_values) add_operand(operand);
+  for (auto operand : extra_operands) add_operand(operand);

  Operation* new_op = OpBuilder(op).insert(Operation::create(
      op->getLoc(), op->getName(), new_types, operands, op->getAttrs(),
@@ -212,7 +242,6 @@ void WhileOutlinePass::runOnModule() {
  getModule().walk(
      [&](mlir::TFL::WhileOp while_op) { OutlineWhile(while_op); });
}
-}  // namespace

// Creates an instance of the TensorFlow Lite dialect WhileOp outline pass.
std::unique_ptr<OpPassBase<ModuleOp>> CreateWhileOutlinePass() {
diff --git a/tensorflow/compiler/mlir/lite/utils/attribute_utils.cc b/tensorflow/compiler/mlir/lite/utils/attribute_utils.cc
index a9cc483df76..3d4bbdfa13c 100644
--- a/tensorflow/compiler/mlir/lite/utils/attribute_utils.cc
+++ b/tensorflow/compiler/mlir/lite/utils/attribute_utils.cc
@@ -38,7 +38,7 @@ FloatAttr GetSingleElementAsFloatOrSelf(Attribute attr) {

IntegerAttr ExtractSingleElementAsInteger(ElementsAttr attr) {
  if (attr.getType().getNumElements() != 1 ||
-      !attr.getType().getElementType().isa<IntegerType>()) {
+      !attr.getType().getElementType().isSignlessInteger()) {
    return {};
  }
  SmallVector<uint64_t, 8> index(attr.getType().getRank(), 0);
diff --git a/tensorflow/compiler/mlir/lite/utils/attribute_utils.h b/tensorflow/compiler/mlir/lite/utils/attribute_utils.h
index 5a11690d15f..7c0ff910db1 100644
--- a/tensorflow/compiler/mlir/lite/utils/attribute_utils.h
+++ b/tensorflow/compiler/mlir/lite/utils/attribute_utils.h
@@ -19,7 +19,7 @@ limitations under the License.
#ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_ATTRIBUTE_UTILS_H_
#define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_ATTRIBUTE_UTILS_H_

-#include "mlir/Dialect/StandardOps/Ops.h"  // TF:llvm-project
+#include "mlir/Dialect/StandardOps/IR/Ops.h"  // TF:llvm-project

namespace mlir {
namespace TFL {
diff --git a/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc b/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc
index f7f77a53529..547a8c9d11e 100644
--- a/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc
+++ b/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc
@@ -21,7 +21,7 @@ limitations under the License.
#include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" #include "llvm/Support/raw_ostream.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project #include "mlir/IR/Function.h" // TF:llvm-project @@ -70,14 +70,14 @@ Value CreateNoneValue(OpBuilder* builder, mlir::Location location) { builder->getUnitAttr()); } -Value Transpose2D(OpBuilder* builder, Value value_to_transpose, - RankedTensorType type, mlir::Location location) { +Value Transpose(OpBuilder* builder, Value value_to_transpose, + SmallVector perm, RankedTensorType original_type, + mlir::Location location) { // Create a constant op for transpose permutation. - SmallVector perm = {1, 0}; auto perm_op = CreateI64DenseConst(builder, perm, perm, location); // Create tensor type for the transpose result. - auto transpose_type = type; + auto transpose_type = original_type; auto transpose_shape = functional::map( [transpose_type](int64_t dim) { return transpose_type.getDimSize(dim); }, perm); @@ -88,6 +88,13 @@ Value Transpose2D(OpBuilder* builder, Value value_to_transpose, value_to_transpose, perm_op); } +Value Transpose2D(OpBuilder* builder, Value value_to_transpose, + RankedTensorType type, mlir::Location location) { + // Create a constant op for transpose permutation. + SmallVector perm = {1, 0}; + return Transpose(builder, value_to_transpose, perm, type, location); +} + ArrayRef GetRankedTensorShape(Value value) { return value.getType().cast().getShape(); } @@ -586,15 +593,30 @@ LogicalResult ConvertKerasLSTMLayer(mlir::FuncOp func_op, OpBuilder* builder) { Value recurrent_kernel = func_op.getArgument(4); Value bias = func_op.getArgument(5); - // Assume it's batch majored. + // TFL lstm only supports time-majored inputs, so if it's not time-majored, + // we will transpose the inputs and outputs. + auto time_major_attr = func_op.getAttrOfType("tf.time_major"); + if (time_major_attr == nullptr) return failure(); + + bool time_majored = time_major_attr.getValue(); auto input_type = input.getType().dyn_cast_or_null(); if (!input_type) { func_op.emitError() << "Input type is not a ranked tensor type"; return failure(); } - int batch = input_type.getDimSize(0); - int time = input_type.getDimSize(1); + auto final_inputs = input; + auto final_input_type = input_type; + // We will transpose the inputs. + if (!time_majored) { + SmallVector perm = {1, 0, 2}; + final_inputs = + Transpose(builder, final_inputs, perm, input_type, func_op.getLoc()); + final_input_type = final_inputs.getType().dyn_cast(); + } + + int batch = final_input_type.getDimSize(1); + int time = final_input_type.getDimSize(0); // Setup correct weights. 
RankedTensorType weight_type = @@ -672,7 +694,13 @@ LogicalResult ConvertKerasLSTMLayer(mlir::FuncOp func_op, OpBuilder* builder) { builder->getF32FloatAttr(10.0), builder->getF32FloatAttr(0.0), builder->getStringAttr("FULL")); - builder->create(func_op.getLoc(), lstm.getResult()); + auto final_output = lstm.getResult(); + if (!time_majored) { + SmallVector perm = {1, 0, 2}; + final_output = + Transpose(builder, final_output, perm, result_type, func_op.getLoc()); + } + builder->create(func_op.getLoc(), final_output); return success(); } diff --git a/tensorflow/compiler/mlir/lite/utils/lstm_utils_test.cc b/tensorflow/compiler/mlir/lite/utils/lstm_utils_test.cc index b229206a4e4..0593bd150c7 100644 --- a/tensorflow/compiler/mlir/lite/utils/lstm_utils_test.cc +++ b/tensorflow/compiler/mlir/lite/utils/lstm_utils_test.cc @@ -24,7 +24,7 @@ limitations under the License. #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/Casting.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project #include "mlir/IR/Function.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/lite/utils/stateful_ops_utils.cc b/tensorflow/compiler/mlir/lite/utils/stateful_ops_utils.cc index a12cad15256..4067cfb04b9 100644 --- a/tensorflow/compiler/mlir/lite/utils/stateful_ops_utils.cc +++ b/tensorflow/compiler/mlir/lite/utils/stateful_ops_utils.cc @@ -17,7 +17,7 @@ limitations under the License. #include -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" namespace mlir { diff --git a/tensorflow/compiler/mlir/lite/utils/stateful_ops_utils.h b/tensorflow/compiler/mlir/lite/utils/stateful_ops_utils.h index 917ae93f6a8..635922d5cbb 100644 --- a/tensorflow/compiler/mlir/lite/utils/stateful_ops_utils.h +++ b/tensorflow/compiler/mlir/lite/utils/stateful_ops_utils.h @@ -16,7 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_STATEFUL_OPS_UTILS_H_ #define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_STATEFUL_OPS_UTILS_H_ -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project namespace mlir { namespace TFL { diff --git a/tensorflow/compiler/mlir/lite/utils/validators.h b/tensorflow/compiler/mlir/lite/utils/validators.h index e1ae4392881..fa1304c68e0 100644 --- a/tensorflow/compiler/mlir/lite/utils/validators.h +++ b/tensorflow/compiler/mlir/lite/utils/validators.h @@ -19,7 +19,7 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_VALIDATORS_H_ #define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_VALIDATORS_H_ -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/StandardTypes.h" // TF:llvm-project namespace mlir { diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index f6a37c4a5f2..be51b5f151a 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -90,7 +90,7 @@ gentbl( td_file = "ir/tf_saved_model_ops.td", td_srcs = [ "@llvm-project//mlir:include/mlir/IR/OpBase.td", - "@llvm-project//mlir:include/mlir/Dialect/StandardOps/Ops.td", + "@llvm-project//mlir:include/mlir/Dialect/StandardOps/IR/Ops.td", ], ) @@ -114,7 +114,7 @@ gentbl( td_file = "ir/tf_executor_ops.td", td_srcs = [ "@llvm-project//mlir:include/mlir/IR/OpBase.td", - "@llvm-project//mlir:include/mlir/Dialect/StandardOps/Ops.td", + "@llvm-project//mlir:include/mlir/Dialect/StandardOps/IR/Ops.td", ], ) @@ -138,7 +138,7 @@ gentbl( td_file = "ir/tf_device_ops.td", td_srcs = [ "@llvm-project//mlir:include/mlir/IR/OpBase.td", - "@llvm-project//mlir:include/mlir/Dialect/StandardOps/Ops.td", + "@llvm-project//mlir:include/mlir/Dialect/StandardOps/IR/Ops.td", ], ) @@ -281,12 +281,12 @@ cc_library( "transforms/generated_canonicalize.inc", "transforms/generated_optimize.inc", "transforms/graph_pruning.cc", - "transforms/inline_global_tensors.cc", "transforms/layout_optimization.cc", "transforms/mark_function_visibility.cc", "transforms/materialize_mlir_passthrough_op.cc", "transforms/optimize.cc", "transforms/optimize_global_tensors.cc", + "transforms/parallel_execute_to_islands.cc", "transforms/promote_resources_to_args.cc", "transforms/raise_control_flow.cc", "transforms/replicate_invariant_op_hoisting.cc", @@ -376,6 +376,7 @@ cc_library( ":tensorflow", "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", "@llvm-project//mlir:IR", + "@llvm-project//mlir:LoopOpsTransforms", ], alwayslink = 1, ) @@ -1000,8 +1001,13 @@ cc_library( srcs = ["utils/tpu_rewrite_device_util.cc"], hdrs = ["utils/tpu_rewrite_device_util.h"], deps = [ + "//tensorflow/compiler/xla:array3d", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/compiler/xla/service:computation_placer", "//tensorflow/core:framework", "//tensorflow/core:lib", + "//tensorflow/core/protobuf/tpu:topology_proto_cc", + "//tensorflow/stream_executor/lib", "@com_google_absl//absl/strings", "@llvm-project//llvm:support", ], @@ -1016,6 +1022,7 @@ tf_cc_test( "//tensorflow/core:framework", "//tensorflow/core:test", "//tensorflow/core:test_main", + "//tensorflow/core/protobuf/tpu:topology_proto_cc", "@llvm-project//llvm:support", ], ) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc index 4b6ff55e5ea..85d87a56f01 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc @@ -26,7 +26,7 @@ limitations under the License. 
#include "llvm/ADT/StringSwitch.h" #include "llvm/Support/Casting.h" #include "llvm/Support/FormatVariadic.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/Dialect/Traits.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project @@ -573,9 +573,9 @@ void Print(SwitchNOp switchn, OpAsmPrinter &p) { ParseResult ParseSwitchNOp(OpAsmParser &parser, OperationState &result) { // Parsing: - // %2:6 = tf_executor.SwitchN %0, %1 by 5 : tensor + // %2:6 = tf_executor.SwitchN %0, %1 of 5 : tensor // Where the first operand is the data to replicate, the second is an i32 - // indicating which output to populate, followed by the keyword `by` and the + // indicating which output to populate, followed by the keyword `of` and the // number of outputs (+1 for the control token). SmallVector op_infos; SmallVector types; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td index 0987ae3d668..38f72f24bd1 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td @@ -165,7 +165,7 @@ def TfExecutor_IslandOp : TfExecutor_Op<"island", The `tf_executor.island` operation has a single region with a single block attached (only functional control flow is allowed). The block is terminated by a `tf_executor.yield` operation. The operands of the terminator - correspond to the result values of the `tf_executor.graph` operation. An + correspond to the result values of the `tf_executor.island` operation. An extra result of type `!tf_executor.control` is always produced by every `tf_executor.island`. Within an island, execution semantics follow standard sequential behavior as @@ -299,7 +299,7 @@ def TfExecutor_SwitchNOp : TfExecutor_Op<"SwitchN", .SetShapeFn(SwitchNShape); For example: - %2:6 = tf_executor.SwitchN %0, %1 by 5 : tensor + %2:6 = tf_executor.SwitchN %0, %1 of 5 : tensor Note: One additional result corresponds to the control output. }]; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 31e85ef247e..77e098c37e5 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -510,6 +510,7 @@ Broadcasting is supported, so `value` may have any number of dimensions. // TF_LayoutSensitiveInterface: SmallVector GetLayoutDependentArgs() { return {0}; } SmallVector GetLayoutDependentResults() { return {0}; } + LogicalResult UpdateDataFormat(StringRef data_format); }]; } @@ -980,7 +981,7 @@ tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j] let hasCanonicalizer = 1; } -def TF_Conv2DOp : TF_Op<"Conv2D", [NoSideEffect]> { +def TF_Conv2DOp : TF_Op<"Conv2D", [NoSideEffect, TF_LayoutSensitiveInterface]> { let summary = [{ Computes a 2-D convolution given 4-D `input` and `filter` tensors. }]; @@ -1030,6 +1031,13 @@ horizontal and vertices strides, `strides = [1, stride, stride, 1]`. 
  let verifier = [{
    return Verify(*this);
  }];
+
+  let extraClassDeclaration = [{
+    // TF_LayoutSensitiveInterface:
+    SmallVector<unsigned, 4> GetLayoutDependentArgs() { return {0}; }
+    SmallVector<unsigned, 4> GetLayoutDependentResults() { return {0}; }
+    LogicalResult UpdateDataFormat(StringRef data_format);
+  }];
}

def TF_Conv2DBackpropFilterOp : TF_Op<"Conv2DBackpropFilter", [NoSideEffect]> {
@@ -2091,7 +2099,7 @@ The size of 1D Tensors matches the dimension C of the 4D Tensors.
  TF_DerivedOperandTypeAttr U = TF_DerivedOperandTypeAttr<3>;
}

-def TF_FusedBatchNormV3Op : TF_Op<"FusedBatchNormV3", [NoSideEffect]> {
+def TF_FusedBatchNormV3Op : TF_Op<"FusedBatchNormV3", [NoSideEffect, TF_FoldOperandsTransposeInterface]> {
  let summary = "Batch normalization.";

  let description = [{
@@ -2122,6 +2130,13 @@ The size of 1D Tensors matches the dimension C of the 4D Tensors.

  TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>;
  TF_DerivedOperandTypeAttr U = TF_DerivedOperandTypeAttr<1>;
+
+  let extraClassDeclaration = [{
+    // TF_FoldOperandsTransposeInterface:
+    SmallVector<unsigned, 4> GetLayoutDependentArgs() { return {0}; }
+    SmallVector<unsigned, 4> GetLayoutDependentResults() { return {0}; }
+    LogicalResult FoldOperandsPermutation(ArrayRef<int64_t> permutation);
+  }];
}

def TF_GatherOp : TF_Op<"Gather", [NoSideEffect]> {
@@ -3096,6 +3111,70 @@ cublas.
  TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>;
}

+def TF_MatrixBandPartOp : TF_Op<"MatrixBandPart", [NoSideEffect, AllTypesMatch<["input", "band"]>]> {
+  let summary = [{
+Copy a tensor setting everything outside a central band in each innermost matrix
+to zero.
+  }];
+
+  let description = [{
+The `band` part is computed as follows:
+Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
+tensor with the same shape where
+
+`band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
+
+The indicator function
+
+`in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower) &&
+                 (num_upper < 0 || (n-m) <= num_upper)`.
+
+For example:
+
+```
+# if 'input' is [[ 0,  1,  2, 3]
+                 [-1,  0,  1, 2]
+                 [-2, -1,  0, 1]
+                 [-3, -2, -1, 0]],
+
+tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
+                                       [-1,  0,  1, 2]
+                                       [ 0, -1,  0, 1]
+                                       [ 0,  0, -1, 0]],
+
+tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
+                                      [-1,  0,  1, 0]
+                                      [-2, -1,  0, 1]
+                                      [ 0, -2, -1, 0]]
+```
+
+Useful special cases:
+
+```
+ tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
+ tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
+ tf.matrix_band_part(input, 0, 0) ==> Diagonal.
+```
+  }];
+
+  let arguments = (ins
+    TF_Tensor:$input,
+    TF_I32OrI64Tensor:$num_lower,
+    TF_I32OrI64Tensor:$num_upper
+  );
+
+  let results = (outs
+    TF_Tensor:$band
+  );
+
+  TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>;
+  TF_DerivedOperandTypeAttr Tindex = TF_DerivedOperandTypeAttr<1>;
+
+  let verifier = [{
+    return Verify(*this);
+  }];
+}
+
def TF_MatrixDiagOp : TF_Op<"MatrixDiag", [NoSideEffect]> {
  let summary = [{
Returns a batched diagonal tensor with a given batched diagonal values.
@@ -4278,7 +4357,7 @@ This is the opposite of `unpack`.
  }];
}

-def TF_PadOp : TF_Op<"Pad", [NoSideEffect]> {
+def TF_PadOp : TF_Op<"Pad", [NoSideEffect, TF_FoldOperandsTransposeInterface]> {
  let summary = "Pads a tensor with zeros.";

  let description = [{
@@ -4317,6 +4396,13 @@ pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]

  TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>;
  TF_DerivedOperandTypeAttr Tpaddings = TF_DerivedOperandTypeAttr<1>;
+
+  let extraClassDeclaration = [{
+    // TF_FoldOperandsTransposeInterface:
+    SmallVector<unsigned, 4> GetLayoutDependentArgs() { return {0}; }
+    SmallVector<unsigned, 4> GetLayoutDependentResults() { return {0}; }
+    LogicalResult FoldOperandsPermutation(ArrayRef<int64_t> permutation);
+  }];
}

def TF_PadV2Op : TF_Op<"PadV2", [NoSideEffect]> {
@@ -4845,7 +4931,7 @@ I.e., \\(y = 1 / x\\).
  let hasCanonicalizer = 1;
}

-def TF_ReluOp : TF_Op<"Relu", [NoSideEffect, SameOperandsAndResultType]> {
+def TF_ReluOp : TF_Op<"Relu", [NoSideEffect, SameOperandsAndResultType, TF_LayoutAgnostic]> {
  let summary = "Computes rectified linear: `max(features, 0)`.";

  let description = [{
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td
index f3fdab674e4..92e6d522125 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td
@@ -85,7 +85,7 @@ class TF_TensorFlowType :

// Any tensor element type allowed in TensorFlow ops
def TF_ElementType : Type, "tf.dtype">;
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td
index 8700247af43..cc0819d71c9 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td
@@ -50,6 +50,12 @@ def TF_LayoutSensitiveInterface : OpInterface<"LayoutSensitiveInterface"> {
      [{Returns indices of layout dependent results.}],
      "SmallVector<unsigned, 4>", "GetLayoutDependentResults", (ins)
    >,
+    InterfaceMethod<
+      [{Updates operation attributes and operands to account for the updated
+        data format. If data format is not supported, must return failure.}],
+      "LogicalResult", "UpdateDataFormat",
+      (ins "StringRef":$data_format)
+    >,
  ];

  let verify = [{
#include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/FormatVariadic.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/Dialect/Traits.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project @@ -151,26 +151,6 @@ static bool AreCastCompatible(Type a, Type b) { b_kind == TensorFlowTypes::VARIANT; } -static bool AreCancellablePermutations(DenseIntElementsAttr perm0, - DenseIntElementsAttr perm1) { - if (perm0.getNumElements() == 0 || perm1.getNumElements() == 0) return false; - if (perm0.getNumElements() != perm1.getNumElements()) return false; - - SmallVector perm0_values; - for (auto value : perm0.getIntValues()) - perm0_values.push_back(value.getSExtValue()); - - SmallVector perm1_values; - for (auto value : perm1.getIntValues()) - perm1_values.push_back(value.getSExtValue()); - - for (int i = 0; i < perm0_values.size(); ++i) { - if (perm0_values[perm1_values[i]] != i) return false; - } - - return true; -} - static bool IsUnknownDimOrRank(int64_t dim_or_rank) { return dim_or_rank == -1; } @@ -312,6 +292,164 @@ static LogicalResult VerifyTypesCompatibility( return success(); } +//===----------------------------------------------------------------------===// +// TF op helper functions to work with layout transformation. +//===----------------------------------------------------------------------===// + +SmallVector ReversePermutation(ArrayRef permutation) { + SmallVector reverse(permutation.size()); + for (size_t i = 0; i < permutation.size(); ++i) { + reverse[permutation[i]] = i; + } + return reverse; +} + +SmallVector GetDataFormatPermutation(StringRef from, StringRef to) { + if (from == "NHWC" && to == "NCHW") { + return {0, 3, 1, 2}; + } else if (from == "NCHW" && to == "NHWC") { + return {0, 2, 3, 1}; + } else { + return {}; + } +} + +// Shuffle elements in the `attr` according to the permutation. Optional +// `inner_size` allows to shuffle array attributes created from rank 2 tensors +// on outer dimension only. +ArrayAttr ShuffleArrayAttr(ArrayAttr attr, ArrayRef permutation, + int inner_size = 1) { + if (attr.size() == 0) return attr; + + assert(attr.size() % inner_size == 0); + assert(attr.size() / inner_size == permutation.size()); + + SmallVector values{attr.begin(), attr.end()}; + SmallVector shuffled(values.size()); + + for (size_t i = 0; i < permutation.size(); ++i) { + for (size_t j = 0; j < inner_size; ++j) { + shuffled[i * inner_size + j] = values[permutation[i] * inner_size + j]; + } + } + + return ArrayAttr::get(shuffled, attr.getContext()); +} + +// Shuffle ranked tensor dimensions according to the permutation. 
+Type ShuffleRankedTensorType(Type type, ArrayRef<int64_t> permutation) {
+  if (auto ranked_type = type.dyn_cast<RankedTensorType>()) {
+    ArrayRef<int64_t> shape = ranked_type.getShape();
+    assert(permutation.size() == shape.size());
+
+    SmallVector<int64_t, 4> new_shape(permutation.size());
+    for (size_t i = 0; i < permutation.size(); ++i)
+      new_shape[i] = shape[permutation[i]];
+
+    return RankedTensorType::get(new_shape, ranked_type.getElementType());
+  }
+
+  return type;
+}
+
+static bool AreCancellablePermutations(DenseIntElementsAttr perm0,
+                                       DenseIntElementsAttr perm1) {
+  if (perm0.getNumElements() == 0 || perm1.getNumElements() == 0) return false;
+  if (perm0.getNumElements() != perm1.getNumElements()) return false;
+
+  SmallVector<int64_t, 8> perm0_values;
+  for (auto value : perm0.getIntValues())
+    perm0_values.push_back(value.getSExtValue());
+
+  SmallVector<int64_t, 8> perm1_values;
+  for (auto value : perm1.getIntValues())
+    perm1_values.push_back(value.getSExtValue());
+
+  for (int i = 0; i < perm0_values.size(); ++i) {
+    if (perm0_values[perm1_values[i]] != i) return false;
+  }
+
+  return true;
+}
+
+// Default implementation of `LayoutSensitiveInterface::UpdateDataFormat` for
+// layout sensitive operations that do not have any additional layout dependent
+// attributes besides the `data_format` string.
+template <typename Op>
+LogicalResult UpdateDataFormat(StringRef data_format, Op *op) {
+  auto perm = GetDataFormatPermutation(op->data_format(), data_format);
+  if (perm.empty()) return failure();
+
+  // Update data format attribute.
+  op->setAttr("data_format", StringAttr::get(data_format, op->getContext()));
+
+  // Update types for all layout sensitive results.
+  auto layout_sensitive = cast<LayoutSensitiveInterface>(op->getOperation());
+  for (unsigned idx : layout_sensitive.GetLayoutDependentResults()) {
+    OpResult result = op->getOperation()->getResult(idx);
+    result.setType(ShuffleRankedTensorType(result.getType(), perm));
+  }
+
+  return success();
+}
+
+// Default implementation for folding an operand transpose into the operation.
+// See `FoldOperandsTransposeInterface::FoldOperandsPermutation`.
+template <typename Op>
+LogicalResult FoldOperandsPermutation(
+    ArrayRef<int64_t> permutation, Op *op,
+    ArrayRef<std::pair<StringRef, ArrayAttr>> shuffle_attrs = {}) {
+  MLIRContext *context = op->template getParentOfType<ModuleOp>().getContext();
+
+  // We only support NHWC <-> NCHW permutations.
+  static constexpr std::array<int64_t, 4> kNchwToNhwc = {0, 2, 3, 1};
+  static constexpr std::array<int64_t, 4> kNhwcToNchw = {0, 3, 1, 2};
+
+  // Operation data format after folding `permutation`.
+  StringRef target_data_format = [&]() -> StringRef {
+    if (op->data_format() == "NHWC" && permutation.equals(kNchwToNhwc)) {
+      return "NCHW";  // cancel NCHW->NHWC operand permutation
+    } else if (op->data_format() == "NCHW" && permutation.equals(kNhwcToNchw)) {
+      return "NHWC";  // cancel NHWC->NCHW operand permutation
+    } else {
+      return "";
+    }
+  }();
+  if (target_data_format.empty()) return failure();
+
+  // To fold operand `permutation` into the `op` we need to shuffle all layout
+  // dependent attributes and types with a reverse permutation, and change
+  // operation data format to `target_data_format`.
+  //
+  // Example:
+  //   %1 = SomeOp(...)   {data_format = NHWC}
+  //   %2 = Transpose(%1) {permutation = NHWC->NCHW}
+  //   %3 = Op(%2)        {data_format = NCHW}
+  //
+  // To bypass %2 we have to change the data format and shuffle with the
+  // NCHW-to-NHWC permutation, which is the reverse of the operand permutation
+  // (the function argument).
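+  // Concretely (a sketch): folding a NCHW->NHWC operand permutation
+  // {0, 2, 3, 1} into an NHWC op retargets it to NCHW, and its layout
+  // dependent attributes and result types are shuffled with the reverse
+  // permutation {0, 3, 1, 2}.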
+  auto reverse_permutation =
+      GetDataFormatPermutation(op->data_format(), target_data_format);
+  if (reverse_permutation.empty()) return failure();
+
+  op->setAttr("data_format", StringAttr::get(target_data_format, context));
+
+  for (auto pair : shuffle_attrs) {
+    StringRef attr_name = pair.first;
+    ArrayAttr attr_value = pair.second;
+    op->setAttr(attr_name, ShuffleArrayAttr(attr_value, reverse_permutation));
+  }
+
+  auto fold = cast<FoldOperandsTransposeInterface>(op->getOperation());
+  for (unsigned idx : fold.GetLayoutDependentResults()) {
+    OpResult result = op->getOperation()->getResult(idx);
+    result.setType(
+        ShuffleRankedTensorType(result.getType(), reverse_permutation));
+  }
+
+  return success();
+}
+
namespace {
#include "tensorflow/compiler/mlir/tensorflow/transforms/generated_canonicalize.inc"
}  // namespace
@@ -479,6 +617,15 @@ static LogicalResult Verify(BiasAddOp op) {
  return success();
}

+// TODO(ezhulenev): BiasAddOp is not really layout sensitive, it must only
+// support folding operand transposes.
+LogicalResult BiasAddOp::UpdateDataFormat(StringRef data_format) {
+  auto ranked = value().getType().dyn_cast<RankedTensorType>();
+  if (!ranked || ranked.getRank() != 4) return failure();
+
+  return ::mlir::TF::UpdateDataFormat(data_format, this);
+}
+
//===----------------------------------------------------------------------===//
// BiasAddGradOp
//===----------------------------------------------------------------------===//
@@ -837,6 +984,21 @@ static LogicalResult Verify(OpT op) {
  return success();
}

+LogicalResult Conv2DOp::UpdateDataFormat(StringRef data_format) {
+  auto perm = GetDataFormatPermutation(this->data_format(), data_format);
+  if (perm.empty()) return failure();
+
+  // Update data_format attribute and result types.
+  if (failed(::mlir::TF::UpdateDataFormat(data_format, this))) return failure();
+
+  // Update convolution attributes.
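+  // For example, with perm = {0, 3, 1, 2} (NHWC to NCHW), strides
+  // [5, 6, 7, 8] become [5, 8, 6, 7], and explicit_paddings are shuffled in
+  // (low, high) pairs via inner_size = 2 (see the layout_assignment tests).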
+ setAttr("dilations", ShuffleArrayAttr(dilations(), perm)); + setAttr("strides", ShuffleArrayAttr(strides(), perm)); + setAttr("explicit_paddings", ShuffleArrayAttr(explicit_paddings(), perm, 2)); + + return success(); +} + //===----------------------------------------------------------------------===// // Conv2dBackpropInputOp //===----------------------------------------------------------------------===// @@ -1158,6 +1320,11 @@ static LogicalResult Verify(FusedBatchNormOp op) { return success(); } +LogicalResult FusedBatchNormV3Op::FoldOperandsPermutation( + ArrayRef permutation) { + return ::mlir::TF::FoldOperandsPermutation(permutation, this); +} + //===----------------------------------------------------------------------===// // GatherV2Op //===----------------------------------------------------------------------===// @@ -1339,6 +1506,29 @@ void LogicalNotOp::getCanonicalizationPatterns( LogicalNotOfLess, LogicalNotOfLessEqual>(context); } +//===----------------------------------------------------------------------===// +// MatrixBandPartOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(MatrixBandPartOp op) { + if (!HasRankAtLeast(op.input(), 2)) { + return op.emitOpError() + << "requires `input` to have rank of at least 2, but found " + << op.input().getType(); + } + if (!IsOfRankOrUnranked(op.num_lower(), 0)) { + return op.emitOpError() + << "requires `num_lower` to have 0 dimensions, but found " + << op.num_lower().getType(); + } + if (!IsOfRankOrUnranked(op.num_upper(), 0)) { + return op.emitOpError() + << "requires `num_upper` to have 0 dimensions, but found " + << op.num_upper().getType(); + } + return success(); +} + //===----------------------------------------------------------------------===// // MaxOp //===----------------------------------------------------------------------===// @@ -1356,57 +1546,8 @@ void MaxOp::build(Builder *builder, OperationState &result, Value input, LogicalResult MaxPoolOp::FoldOperandsPermutation( ArrayRef permutation) { - MLIRContext *context = getParentOfType().getContext(); - - // For now we only support folding of NCHW->NHWC and NHWC->NCHW permutations. 
-  if (data_format() == "NHWC") {
-    static constexpr std::array<int64_t, 4> kPerm = {0, 2, 3, 1};  // to NHWC
-    if (permutation != ArrayRef<int64_t>(kPerm)) return failure();
-
-    setAttr("data_format", StringAttr::get("NCHW", context));
-
-  } else if (data_format() == "NCHW") {
-    static constexpr std::array<int64_t, 4> kPerm = {0, 3, 1, 2};  // to NCHW
-    if (permutation != ArrayRef<int64_t>(kPerm)) return failure();
-
-    setAttr("data_format", StringAttr::get("NHWC", context));
-
-  } else {
-    return failure();
-  }
-
-  auto shuffle_attr = [&](ArrayAttr attr) -> ArrayAttr {
-    SmallVector<Attribute, 4> values{attr.begin(), attr.end()};
-    SmallVector<Attribute, 4> shuffled(values.size());
-
-    for (size_t i = 0; i < permutation.size(); ++i)
-      shuffled[permutation[i]] = values[i];
-
-    return ArrayAttr::get(shuffled, context);
-  };
-
-  setAttr("strides", shuffle_attr(strides()));
-  setAttr("ksize", shuffle_attr(ksize()));
-
-  auto shuffle_type = [&](Type type) -> Type {
-    if (auto ranked_type = type.dyn_cast<RankedTensorType>()) {
-      ArrayRef<int64_t> shape = ranked_type.getShape();
-      assert(permutation.size() == shape.size());
-
-      SmallVector<int64_t, 4> new_shape(permutation.size());
-      for (size_t i = 0; i < permutation.size(); ++i)
-        new_shape[permutation[i]] = shape[i];
-
-      return RankedTensorType::get(new_shape, ranked_type.getElementType());
-    }
-
-    return type;
-  };
-
-  OpResult result = getOperation()->getResult(0);
-  result.setType(shuffle_type(result.getType()));
-
-  return success();
+  return ::mlir::TF::FoldOperandsPermutation(
+      permutation, this, {{"strides", strides()}, {"ksize", ksize()}});
}

//===----------------------------------------------------------------------===//
@@ -1426,6 +1567,38 @@ static LogicalResult Verify(MaxPoolGradOp op) {
  return success();
}

+//===----------------------------------------------------------------------===//
+// MeanOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult MeanOp::FoldOperandsPermutation(ArrayRef<int64_t> permutation) {
+  // Reduction indices must be defined by a constant operation.
+  auto reduction_op =
+      dyn_cast_or_null<TF::ConstOp>(reduction_indices().getDefiningOp());
+  if (!reduction_op) return failure();
+
+  auto reductions_value = reduction_op.value().dyn_cast<DenseElementsAttr>();
+  if (!reductions_value) return failure();
+
+  // Prepare new reduction indices according to operand permutation.
+  SmallVector<int32_t, 8> shuffled_reduction;
+  llvm::transform(reductions_value.getIntValues(),
+                  std::back_inserter(shuffled_reduction),
+                  [&](APInt idx) { return permutation[idx.getSExtValue()]; });
+
+  // Add a constant operation with the new reduction indices.
+  OpBuilder builder(getOperation());
+  auto type = mlir::RankedTensorType::get(shuffled_reduction.size(),
+                                          builder.getIntegerType(32));
+  auto values = mlir::DenseIntElementsAttr::get(type, shuffled_reduction);
+  auto shuffled_reduction_op = builder.create<TF::ConstOp>(getLoc(), values);
+
+  // Use the new reduction indices.
+  setOperand(1, shuffled_reduction_op);
+
+  return success();
+}
+
//===----------------------------------------------------------------------===//
// NegOp
//===----------------------------------------------------------------------===//
@@ -1568,6 +1741,46 @@ static LogicalResult Verify(PackOp op) {
  return success();
}

+//===----------------------------------------------------------------------===//
+// PadOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult PadOp::FoldOperandsPermutation(ArrayRef<int64_t> permutation) {
+  // Paddings must be defined by a constant operation.
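+  // (Paddings form a rank-2 [n, 2] tensor of (low, high) pairs, so only the
+  // outer dimension is permuted and each pair moves as a unit.)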
+  auto paddings_op = dyn_cast_or_null<TF::ConstOp>(paddings().getDefiningOp());
+  if (!paddings_op) return failure();
+
+  auto paddings_value = paddings_op.value().dyn_cast<DenseElementsAttr>();
+  if (!paddings_value ||
+      paddings_value.getNumElements() != permutation.size() * 2)
+    return failure();
+
+  SmallVector<int32_t, 8> shuffled_paddings(paddings_value.getNumElements());
+  for (auto index_pair : llvm::enumerate(paddings_value.getIntValues())) {
+    size_t outer_idx = index_pair.index() / 2;
+    size_t inner_idx = index_pair.index() % 2;
+
+    shuffled_paddings[permutation[outer_idx] * 2 + inner_idx] =
+        index_pair.value().getSExtValue();
+  }
+
+  // Add a constant operation with the new paddings.
+  OpBuilder builder(getOperation());
+  auto type = mlir::RankedTensorType::get(paddings_value.getType().getShape(),
+                                          builder.getIntegerType(32));
+  auto values = mlir::DenseIntElementsAttr::get(type, shuffled_paddings);
+  auto shuffled_paddings_op = builder.create<TF::ConstOp>(getLoc(), values);
+
+  // Use the new paddings.
+  setOperand(1, shuffled_paddings_op);
+
+  // Change the result type.
+  getResult().setType(ShuffleRankedTensorType(getResult().getType(),
+                                              ReversePermutation(permutation)));
+
+  return success();
+}
+
//===----------------------------------------------------------------------===//
// ParseExampleV2Op
//===----------------------------------------------------------------------===//
@@ -1914,7 +2127,8 @@ LogicalResult VerifyShapeOperandAndResult(Operation *op, Type operand_type,
  }

  Type element_type = result_ranked_type.getElementType();
-  if (!element_type.isInteger(32) && !element_type.isInteger(64))
+  if (!element_type.isSignlessInteger(32) &&
+      !element_type.isSignlessInteger(64))
    return op->emitOpError("requires int32 or int64 return type for result")
           << variadic_idx_str;
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td
index b391d5284a5..e95fcbbdad3 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td
@@ -172,7 +172,7 @@ else_branch: A function that takes 'inputs' and returns a list of
  }];
}

-def TF_MeanOp : TF_Op<"Mean", [NoSideEffect]> {
+def TF_MeanOp : TF_Op<"Mean", [NoSideEffect, TF_FoldOperandsTransposeInterface]> {
  let summary = "Computes the mean of elements across dimensions of a tensor.";

  let description = [{
@@ -195,6 +195,13 @@ retained with length 1.
  TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>;
  TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<1>;
+
+  let extraClassDeclaration = [{
+    // TF_FoldOperandsTransposeInterface:
+    SmallVector<unsigned, 4> GetLayoutDependentArgs() { return {0}; }
+    SmallVector<unsigned, 4> GetLayoutDependentResults() { return {}; }
+    LogicalResult FoldOperandsPermutation(ArrayRef<int64_t> permutation);
+  }];
}

def TF_LegacyCallOp : TF_Op<"LegacyCall",
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc
index 8d3253ef81f..ea46662bace 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc
@@ -112,24 +112,20 @@ static LogicalResult VerifyIndexPath(Operation *op, NamedAttribute named_attr) {
  return mlir::success();
}

+Type GetBoundInputArgTypeFor(GlobalTensorOp global_tensor) {
+  auto type = global_tensor.type().cast<TensorType>();
+  return RankedTensorType::get(
+      {}, TF::ResourceType::get({type}, type.getContext()));
+}
+
static LogicalResult VerifyBoundInputArgType(Operation *op_for_diagnostics,
                                             Type arg_type,
                                             GlobalTensorOp global_tensor) {
-  if (global_tensor.is_mutable()) {
-    auto expected_type = RankedTensorType::get(
-        {}, TF::ResourceType::get({global_tensor.type().cast<TensorType>()},
-                                  arg_type.getContext()));
-    if (arg_type != expected_type) {
-      return op_for_diagnostics->emitError()
-             << "mutable bound input with type " << arg_type
-             << " expected to have type " << expected_type;
-    }
-  } else {
-    if (arg_type != global_tensor.type()) {
-      return op_for_diagnostics->emitError()
-             << "bound input for immutable 'tf_saved_model.global_tensor' must "
-                "match the global tensor's type";
-    }
+  auto expected_type = GetBoundInputArgTypeFor(global_tensor);
+  if (arg_type != expected_type) {
+    return op_for_diagnostics->emitError()
+           << "bound input with type " << arg_type << " expected to have type "
+           << expected_type;
  }
  return success();
}
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h
index 6f4b2061628..e93293741f4 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h
@@ -57,6 +57,10 @@ bool HasTfSavedModelSemantics(ModuleOp module);
GlobalTensorOp LookupBoundInput(FuncOp func, int arg_index,
                                const SymbolTable &symbol_table);

+// Gets the type that an exported function arg that is bound to `global_tensor`
+// should have.
+Type GetBoundInputArgTypeFor(GlobalTensorOp global_tensor);
+
}  // namespace tf_saved_model
}  // namespace mlir
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h
index 6115dac8e03..4059aba209f 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h
@@ -91,7 +91,7 @@ class TensorFlowType : public Type {
// Returns true if the specified type is a valid TensorFlow element type.
static inline bool IsValidTFElementType(Type type) {
  return type.isa<ComplexType>() || type.isa<FloatType>() ||
-         type.isa<IntegerType>() || type.isa<TensorFlowType>();
+         type.isSignlessInteger() || type.isa<TensorFlowType>();
}

// Returns true if this is a valid TensorFlow tensor type.
@@ -141,20 +141,16 @@ class TensorFlowRefType : public TensorFlowType { static TensorFlowType get(Type type); static TensorFlowType getChecked(Type type, MLIRContext* context, Location loc) { - if (failed(verifyConstructionInvariants(loc, context, type))) { + if (failed(verifyConstructionInvariants(loc, type))) { return TensorFlowRefType(); } return get(type); } - static LogicalResult verifyConstructionInvariants( - llvm::Optional loc, MLIRContext* context, Type type) { + static LogicalResult verifyConstructionInvariants(Location loc, Type type) { // type should be a valid TensorFlow type. if (!IsValidTFTensorType(type)) { - if (loc) { - emitError(*loc) << "invalid TensorFlow type: " << type; - } - return failure(); + return emitError(loc) << "invalid TensorFlow type: " << type; } return success(); } @@ -230,7 +226,7 @@ class TypeWithSubtypeImpl static Derived getChecked(ArrayRef subtypes, MLIRContext* context, Location loc) { - return Base::getChecked(loc, context, Derived::getTypeKind(), subtypes); + return Base::getChecked(loc, Derived::getTypeKind(), subtypes); } static Derived get(MLIRContext* context) { return get({}, context); } @@ -239,16 +235,12 @@ class TypeWithSubtypeImpl static bool kindof(unsigned kind) { return kind == Derived::getTypeKind(); } static LogicalResult verifyConstructionInvariants( - llvm::Optional loc, MLIRContext* context, - ArrayRef subtypes) { + Location loc, ArrayRef subtypes) { // Each of the subtypes should be a valid TensorFlow type. for (TensorType subtype : subtypes) { if (!IsValidTFTensorType(subtype)) { - if (loc) { - emitError(*loc) << "invalid " << Derived::getTypeName() - << " subtype: " << subtype; - } - return failure(); + return emitError(loc) << "invalid " << Derived::getTypeName() + << " subtype: " << subtype; } } return success(); diff --git a/tensorflow/compiler/mlir/tensorflow/tests/breakup-islands.mlir b/tensorflow/compiler/mlir/tensorflow/tests/breakup-islands.mlir index 8659f52e301..61e0772726c 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/breakup-islands.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/breakup-islands.mlir @@ -280,3 +280,67 @@ func @empty_island_multiple_data_results(%arg0: tensor<*xf32>, %arg1: tensor<*xi } return } + +// The following tests check that certain control dependencies between islands +// and certain tf_executor ops are added correctly. 
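+// In each test, the island wrapping "tf.Print" is expected to be split out,
+// and its control result (%[[CONTROL]]) must be threaded into the following
+// tf_executor op, as sketched by the CHECK lines.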
+ +// CHECK: %[[CONTROL:[^ ,]*]] = tf_executor.island wraps "tf.Print" +// CHECK: tf_executor.NextIteration.Sink [{{.*}}] {{.*}}, %[[CONTROL]] +func @next_iteration_sink_control_input() { + tf_executor.graph { + %source:3 = tf_executor.NextIteration.Source : tensor<*xi32> + %island:2 = tf_executor.island { + %const = "tf.Const"() {value = dense<1> : tensor} : () -> tensor<*xi32> + %print = "tf.Print"(%const) : (tensor<*xi32>) -> (tensor<*xi32>) + tf_executor.yield %const : tensor<*xi32> + } + tf_executor.NextIteration.Sink[%source#1] %island#0 : tensor<*xi32> + tf_executor.fetch %island#0 : tensor<*xi32> + } + return +} + +// CHECK: %[[CONTROL:[^ ,]*]] = tf_executor.island wraps "tf.Print" +// CHECK: tf_executor.LoopCond {{.*}}, %[[CONTROL]] +func @loop_cond_control_input() { + tf_executor.graph { + %island:2 = tf_executor.island { + %const = "tf.Const"() {value = dense<1> : tensor} : () -> tensor<*xi1> + %print = "tf.Print"(%const) : (tensor<*xi1>) -> (tensor<*xi1>) + tf_executor.yield %const : tensor<*xi1> + } + %loop_cond:2 = tf_executor.LoopCond %island#0 : tensor<*xi1> + tf_executor.fetch %loop_cond#0 : tensor<*xi1> + } + return +} + +// CHECK: %[[CONTROL:[^ ,]*]] = tf_executor.island wraps "tf.Print" +// CHECK: tf_executor.Enter {{.*}}, %[[CONTROL]] +func @enter_control_input() { + tf_executor.graph { + %island:2 = tf_executor.island { + %const = "tf.Const"() {value = dense<1> : tensor} : () -> tensor<*xi32> + %print = "tf.Print"(%const) : (tensor<*xi32>) -> (tensor<*xi32>) + tf_executor.yield %const : tensor<*xi32> + } + %enter:2 = tf_executor.Enter %island#0 frame "some/frame" : tensor<*xi32> + tf_executor.fetch %enter#0 : tensor<*xi32> + } + return +} + +// CHECK: %[[CONTROL:[^ ,]*]] = tf_executor.island wraps "tf.Print" +// CHECK: tf_executor.SwitchN {{.*}}, {{.*}} of {{[0-9]*}} (%[[CONTROL]]) +func @switchn_control_input(%arg1: tensor) { + tf_executor.graph { + %island:2 = tf_executor.island { + %const = "tf.Const"() {value = dense<1> : tensor} : () -> tensor<*xi32> + %print = "tf.Print"(%const) : (tensor<*xi32>) -> (tensor<*xi32>) + tf_executor.yield %const : tensor<*xi32> + } + %switchn:4 = tf_executor.SwitchN %island#0, %arg1 of 3: tensor<*xi32> + tf_executor.fetch %switchn#0 : tensor<*xi32> + } + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/device_assignment.mlir b/tensorflow/compiler/mlir/tensorflow/tests/device_assignment.mlir index 1f1e6c63f30..6971cf06648 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/device_assignment.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/device_assignment.mlir @@ -9,5 +9,7 @@ func @device_test(%arg0: tensor<3x1xf32>) -> (tensor<3x3xf32>) { %1 = "tf.MatMul"(%arg0, %0) {T = f32, _output_shapes = ["tfshape$dim { size: 3 } dim { size: 3 }"], device = "", transpose_a = false, transpose_b = false} : (tensor<3x1xf32>, tensor<1x3xf32>) -> tensor<3x3xf32> // CHECK: device = "cpu" %2 = "tf.Relu"(%1) {T = f32, _output_shapes = ["tfshape$dim { size: 3 } dim { size: 3 }"], device = "cpu"} : (tensor<3x3xf32>) -> tensor<3x3xf32> - return %2 : tensor<3x3xf32> + // CHECK: device = "gpu" + %3 = "tf.Relu"(%2) {T = f32, _output_shapes = ["tfshape$dim { size: 3 } dim { size: 3 }"]} : (tensor<3x3xf32>) -> tensor<3x3xf32> + return %3 : tensor<3x3xf32> } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nchw.mlir similarity index 57% rename from 
tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment.mlir rename to tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nchw.mlir index e8d667aea0f..0610cbe8680 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nchw.mlir @@ -6,10 +6,10 @@ func @transposeBiasAdd(%arg0: tensor<1x4x4x8xf32>, %arg1: tensor<8xf32>) -> tens // Check that BiasAdd was converted to forced data format, and layout // dependent arguments and results passed through transpose nodes. - // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} + // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) // CHECK: %[[BIAS_ADD:[0-9]*]] = "tf.BiasAdd"(%[[ARG_TRANSPOSE]], %arg1) {data_format = "NCHW"} {{.*}} tensor<1x8x4x4xf32> - // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} + // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[BIAS_ADD]], %[[RES_PERM]]) // CHECK: return %[[RES_TRANSPOSE]] %0 = "tf.BiasAdd"(%arg0, %arg1) {data_format = "NHWC"} : (tensor<1x4x4x8xf32>, tensor<8xf32>) -> tensor<1x4x4x8xf32> @@ -20,10 +20,10 @@ func @transposeBiasAdd(%arg0: tensor<1x4x4x8xf32>, %arg1: tensor<8xf32>) -> tens // CHECK-LABEL: func @transposeBiasAddWithDefaultAttr func @transposeBiasAddWithDefaultAttr(%arg0: tensor<1x4x4x8xf32>, %arg1: tensor<8xf32>) -> tensor<1x4x4x8xf32> { - // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} + // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) // CHECK: %[[BIAS_ADD:[0-9]*]] = "tf.BiasAdd"(%[[ARG_TRANSPOSE]], %arg1) {data_format = "NCHW"} {{.*}} tensor<1x8x4x4xf32> - // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} + // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[BIAS_ADD]], %[[RES_PERM]]) // CHECK: return %[[RES_TRANSPOSE]] %0 = "tf.BiasAdd"(%arg0, %arg1) : (tensor<1x4x4x8xf32>, tensor<8xf32>) -> tensor<1x4x4x8xf32> @@ -38,4 +38,38 @@ func @transposeBiasWithUnknownShape(%arg0: tensor<1x4x4x8xf32>, %arg1: tensor<8x %0 = "tf.BiasAdd"(%arg0, %arg1) : (tensor<1x4x4x8xf32>, tensor<8xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> -} \ No newline at end of file +} + +// CHECK-LABEL: func @transposeConv2D +func @transposeConv2D(%input: tensor<1x32x32x3xf32>, %filter: tensor<1x1x3x8xf32>) -> tensor<1x32x32x8xf32> { + + // IMPORTANT: Tensor shapes do not match convolution parameters (stride, + // dilations, etc...). This test only verifies that changing convolution data + // layout will update all the attributes. 
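+  // With the NHWC-to-NCHW argument permutation {0, 3, 1, 2}, every rank-4
+  // attribute is shuffled the same way: dilations [1, 2, 3, 4] -> [1, 4, 2, 3],
+  // strides [5, 6, 7, 8] -> [5, 8, 6, 7], and explicit_paddings move in
+  // (low, high) pairs.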
+ + // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} + // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) + + // CHECK: %[[CONV2D:[0-9]*]] = "tf.Conv2D"(%[[ARG_TRANSPOSE]], %arg1) + // CHECK-SAME: data_format = "NCHW" + // CHECK-SAME: dilations = [1, 4, 2, 3] + // CHECK-SAME: explicit_paddings = [1, 2, 7, 8, 3, 4, 5, 6] + // CHECK-SAME: padding = "EXPLICIT" + // CHECK-SAME: strides = [5, 8, 6, 7] + // CHECK-SAME: (tensor<1x3x32x32xf32>, tensor<1x1x3x8xf32>) -> tensor<1x8x32x32xf32> + + // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} + // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[CONV2D]], %[[RES_PERM]]) + // CHECK: return %[[RES_TRANSPOSE]] + + %0 = "tf.Conv2D"(%input, %filter) + { + data_format = "NHWC", + dilations = [1, 2, 3, 4], + explicit_paddings = [1, 2, 3, 4, 5, 6, 7, 8], + padding = "EXPLICIT", + strides = [5, 6, 7, 8] + } : (tensor<1x32x32x3xf32>, tensor<1x1x3x8xf32>) -> tensor<1x32x32x8xf32> + + return %0 : tensor<1x32x32x8xf32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nhwc.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nhwc.mlir new file mode 100644 index 00000000000..0ed7b833158 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nhwc.mlir @@ -0,0 +1,35 @@ +// RUN: tf-opt %s -tf-layout-assignment=force-data-format=NHWC -verify-diagnostics | FileCheck %s --dump-input=always + +// CHECK-LABEL: func @transposeConv2D +func @transposeConv2D(%input: tensor<1x3x32x32xf32>, %filter: tensor<1x1x3x8xf32>) -> tensor<1x8x32x32xf32> { + + // IMPORTANT: Tensor shapes do not match convolution parameters (stride, + // dilations, etc...). This test only verifies that changing convolution data + // layout will update all the attributes. 
+ + // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} + // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) + + // CHECK: %[[CONV2D:[0-9]*]] = "tf.Conv2D"(%[[ARG_TRANSPOSE]], %arg1) + // CHECK-SAME: data_format = "NHWC" + // CHECK-SAME: dilations = [1, 3, 4, 2] + // CHECK-SAME: explicit_paddings = [1, 2, 5, 6, 7, 8, 3, 4] + // CHECK-SAME: padding = "EXPLICIT" + // CHECK-SAME: strides = [5, 7, 8, 6] + // CHECK-SAME: (tensor<1x32x32x3xf32>, tensor<1x1x3x8xf32>) -> tensor<1x32x32x8xf32> + + // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} + // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[CONV2D]], %[[RES_PERM]]) + // CHECK: return %[[RES_TRANSPOSE]] + + %0 = "tf.Conv2D"(%input, %filter) + { + data_format = "NCHW", + dilations = [1, 2, 3, 4], + explicit_paddings = [1, 2, 3, 4, 5, 6, 7, 8], + padding = "EXPLICIT", + strides = [5, 6, 7, 8] + } : (tensor<1x3x32x32xf32>, tensor<1x1x3x8xf32>) -> tensor<1x8x32x32xf32> + + return %0 : tensor<1x8x32x32xf32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_begin.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_begin.mlir index adb9059256c..0b1e27733eb 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_begin.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_begin.mlir @@ -3,14 +3,14 @@ // CHECK-LABEL: func @move_across_single_op func @move_across_single_op(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32> { - // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} + // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) // CHECK: %[[TANH:[0-9]*]] = "tf.Tanh"(%[[ARG_TRANSPOSE]]) {{.*}} tensor<1x8x4x4xf32> // CHECK: return %[[TANH]] %0 = "tf.Tanh"(%arg0) : (tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32> - %1 = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} : () -> tensor<4xi64> - %2 = "tf.Transpose"(%0, %1) : (tensor<1x4x4x8xf32>, tensor<4xi64>) -> tensor<1x8x4x4xf32> + %1 = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> + %2 = "tf.Transpose"(%0, %1) : (tensor<1x4x4x8xf32>, tensor<4xi32>) -> tensor<1x8x4x4xf32> return %2 : tensor<1x8x4x4xf32> } @@ -18,17 +18,17 @@ func @move_across_single_op(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32> { // CHECK-LABEL: func @move_across_multiple_ops func @move_across_multiple_ops(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32> { - // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} + // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) - // CHECK: %[[TANH0:[0-9]*]] = "tf.Tanh"(%[[ARG_TRANSPOSE]]) {{.*}} tensor<1x8x4x4xf32> - // CHECK: %[[TANH1:[0-9]*]] = "tf.Tanh"(%[[TANH0]]) {{.*}} tensor<1x8x4x4xf32> - // CHECK: return %[[TANH1]] + // CHECK: %[[TANH:[0-9]*]] = "tf.Tanh"(%[[ARG_TRANSPOSE]]) {{.*}} tensor<1x8x4x4xf32> + // CHECK: %[[RELU:[0-9]*]] = "tf.Relu"(%[[TANH]]) {{.*}} tensor<1x8x4x4xf32> + // CHECK: return %[[RELU]] %0 = "tf.Tanh"(%arg0) : (tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32> - %1 = "tf.Tanh"(%0) : (tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32> + %1 = "tf.Relu"(%0) : (tensor<1x4x4x8xf32>) 
-> tensor<1x4x4x8xf32> - %2 = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} : () -> tensor<4xi64> - %3 = "tf.Transpose"(%1, %2) : (tensor<1x4x4x8xf32>, tensor<4xi64>) -> tensor<1x8x4x4xf32> + %2 = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> + %3 = "tf.Transpose"(%1, %2) : (tensor<1x4x4x8xf32>, tensor<4xi32>) -> tensor<1x8x4x4xf32> return %3 : tensor<1x8x4x4xf32> } @@ -36,15 +36,15 @@ func @move_across_multiple_ops(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32 // CHECK-LABEL: func @move_across_multi_operand_op func @move_across_multi_operand_op(%arg0: tensor<1x4x4x8xf32>, %arg1: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32> { - // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} + // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} // CHECK: %[[ARG0_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) // CHECK: %[[ARG1_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg1, %[[ARG_PERM]]) // CHECK: %[[ADD:[0-9]*]] = "tf.AddV2"(%[[ARG0_TRANSPOSE]], %[[ARG1_TRANSPOSE]]) {{.*}} tensor<1x8x4x4xf32> // CHECK: return %[[ADD]] %0 = "tf.AddV2"(%arg0, %arg1) : (tensor<1x4x4x8xf32>, tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32> - %1 = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} : () -> tensor<4xi64> - %2 = "tf.Transpose"(%0, %1) : (tensor<1x4x4x8xf32>, tensor<4xi64>) -> tensor<1x8x4x4xf32> + %1 = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> + %2 = "tf.Transpose"(%0, %1) : (tensor<1x4x4x8xf32>, tensor<4xi32>) -> tensor<1x8x4x4xf32> return %2 : tensor<1x8x4x4xf32> } @@ -52,7 +52,7 @@ func @move_across_multi_operand_op(%arg0: tensor<1x4x4x8xf32>, %arg1: tensor<1x4 // CHECK-LABEL: func @move_with_multiple_uses func @move_with_multiple_uses(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32> { - // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} + // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) // CHECK: %[[TANH:[0-9]*]] = "tf.Tanh"(%[[ARG_TRANSPOSE]]) {{.*}} tensor<1x8x4x4xf32> // CHECK: %[[ADD:[0-9]*]] = "tf.AddV2"(%[[TANH]], %[[TANH]]) {{.*}} tensor<1x8x4x4xf32> @@ -60,8 +60,8 @@ func @move_with_multiple_uses(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32> %0 = "tf.Tanh"(%arg0) : (tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32> %1 = "tf.AddV2"(%0, %0) : (tensor<1x4x4x8xf32>, tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32> - %2 = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} : () -> tensor<4xi64> - %3 = "tf.Transpose"(%1, %2) : (tensor<1x4x4x8xf32>, tensor<4xi64>) -> tensor<1x8x4x4xf32> + %2 = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> + %3 = "tf.Transpose"(%1, %2) : (tensor<1x4x4x8xf32>, tensor<4xi32>) -> tensor<1x8x4x4xf32> return %3 : tensor<1x8x4x4xf32> } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_end.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_end.mlir index 10fc70683b3..5f138d749a2 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_end.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_end.mlir @@ -3,13 +3,13 @@ // CHECK-LABEL: func @move_across_single_op func @move_across_single_op(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32> { - // CHECK: 
%[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} + // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} // CHECK: %[[TANH:[0-9]*]] = "tf.Tanh"(%arg0) {{.*}} tensor<1x4x4x8xf32> // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[TANH]], %[[RES_PERM]]) {{.*}} tensor<1x8x4x4xf32> // CHECK: return %[[RES_TRANSPOSE]] - %0 = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} : () -> tensor<4xi64> - %1 = "tf.Transpose"(%arg0, %0) : (tensor<1x4x4x8xf32>, tensor<4xi64>) -> tensor<1x8x4x4xf32> + %0 = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> + %1 = "tf.Transpose"(%arg0, %0) : (tensor<1x4x4x8xf32>, tensor<4xi32>) -> tensor<1x8x4x4xf32> %2 = "tf.Tanh"(%1) : (tensor<1x8x4x4xf32>) -> tensor<1x8x4x4xf32> return %2 : tensor<1x8x4x4xf32> @@ -18,16 +18,16 @@ func @move_across_single_op(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32> { // CHECK-LABEL: func @move_across_multiple_ops func @move_across_multiple_ops(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32> { - // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} - // CHECK: %[[TANH0:[0-9]*]] = "tf.Tanh"(%arg0) {{.*}} tensor<1x4x4x8xf32> - // CHECK: %[[TANH1:[0-9]*]] = "tf.Tanh"(%[[TANH0]]) {{.*}} tensor<1x4x4x8xf32> - // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[TANH1]], %[[RES_PERM]]) + // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} + // CHECK: %[[TANH:[0-9]*]] = "tf.Tanh"(%arg0) {{.*}} tensor<1x4x4x8xf32> + // CHECK: %[[RELU:[0-9]*]] = "tf.Relu"(%[[TANH]]) {{.*}} tensor<1x4x4x8xf32> + // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[RELU]], %[[RES_PERM]]) // CHECK: return %[[RES_TRANSPOSE]] - %0 = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} : () -> tensor<4xi64> - %1 = "tf.Transpose"(%arg0, %0) : (tensor<1x4x4x8xf32>, tensor<4xi64>) -> tensor<1x8x4x4xf32> + %0 = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> + %1 = "tf.Transpose"(%arg0, %0) : (tensor<1x4x4x8xf32>, tensor<4xi32>) -> tensor<1x8x4x4xf32> %2 = "tf.Tanh"(%1) : (tensor<1x8x4x4xf32>) -> tensor<1x8x4x4xf32> - %3 = "tf.Tanh"(%2) : (tensor<1x8x4x4xf32>) -> tensor<1x8x4x4xf32> + %3 = "tf.Relu"(%2) : (tensor<1x8x4x4xf32>) -> tensor<1x8x4x4xf32> return %3 : tensor<1x8x4x4xf32> } @@ -35,14 +35,14 @@ func @move_across_multiple_ops(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32 // CHECK-LABEL: func @move_across_multi_operand_op func @move_across_multi_operand_op(%arg0: tensor<1x4x4x8xf32>, %arg1: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32> { - // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} + // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} // CHECK: %[[ADD:[0-9]*]] = "tf.AddV2"(%arg0, %arg1) {{.*}} tensor<1x4x4x8xf32> // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[ADD]], %[[RES_PERM]]) // CHECK: return %[[RES_TRANSPOSE]] - %0 = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} : () -> tensor<4xi64> - %1 = "tf.Transpose"(%arg0, %0) : (tensor<1x4x4x8xf32>, tensor<4xi64>) -> tensor<1x8x4x4xf32> - %2 = "tf.Transpose"(%arg1, %0) : (tensor<1x4x4x8xf32>, tensor<4xi64>) -> tensor<1x8x4x4xf32> + %0 = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> + %1 = "tf.Transpose"(%arg0, %0) : (tensor<1x4x4x8xf32>, tensor<4xi32>) -> tensor<1x8x4x4xf32> + %2 = "tf.Transpose"(%arg1, %0) : (tensor<1x4x4x8xf32>, 
tensor<4xi32>) -> tensor<1x8x4x4xf32> %3 = "tf.AddV2"(%1, %2) : (tensor<1x8x4x4xf32>, tensor<1x8x4x4xf32>) -> tensor<1x8x4x4xf32> return %3 : tensor<1x8x4x4xf32> @@ -54,14 +54,14 @@ func @fold_into_max_pool(%arg0: tensor<1x64x112x112xf32>) -> tensor<1x56x56x64xf // MaxPool operand transpose must be folded into the op and MaxPool // must use NCHW data format with updated kernel size and strides. - // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} + // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} // CHECK: %[[MAX_POOL:[0-9]*]] = "tf.MaxPool"(%arg0) {data_format = "NCHW", ksize = [1, 1, 3, 3], padding = "SAME", strides = [1, 1, 2, 2]} : (tensor<1x64x112x112xf32>) -> tensor<1x64x56x56xf32> - // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[ADD]], %[[RES_PERM]]) + // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[MAX_POOL]], %[[RES_PERM]]) // CHECK: return %[[RES_TRANSPOSE]] // Transpose NCHW -> NHWC - %0 = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} : () -> tensor<4xi64> - %1 = "tf.Transpose"(%arg0, %0) : (tensor<1x64x112x112xf32>, tensor<4xi64>) -> tensor<1x112x112x64xf32> + %0 = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> + %1 = "tf.Transpose"(%arg0, %0) : (tensor<1x64x112x112xf32>, tensor<4xi32>) -> tensor<1x112x112x64xf32> // Compute MaxPool in NHWC format %2 = "tf.MaxPool"(%1) @@ -72,3 +72,49 @@ func @fold_into_max_pool(%arg0: tensor<1x64x112x112xf32>) -> tensor<1x56x56x64xf return %2 : tensor<1x56x56x64xf32> } + +// CHECK-LABEL: func @fold_into_mean +func @fold_into_mean(%arg0: tensor<1x64x112x112xf32>) -> tensor<1x64xf32> { + + // CHECK: %[[RED_IDX:[0-9]*]] = "tf.Const"() {value = dense<[2, 3]> : tensor<2xi32>} + // CHECK: %[[MEAN:[0-9]*]] = "tf.Mean"(%arg0, %[[RED_IDX]]) + // CHECK-SAME: (tensor<1x64x112x112xf32>, tensor<2xi32>) -> tensor<1x64xf32> + // CHECK: return %[[MEAN]] + + // Transpose NCHW -> NHWC + %0 = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> + %1 = "tf.Transpose"(%arg0, %0) : (tensor<1x64x112x112xf32>, tensor<4xi32>) -> tensor<1x112x112x64xf32> + + // Compute Mean over spatial dimensions in NHWC format. 
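+  // Folding the transpose into tf.Mean is expected to remap the NHWC
+  // reduction indices [1, 2] used below to the NCHW spatial dimensions
+  // [2, 3], matching the %[[RED_IDX]] constant checked above.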
+ %2 = "tf.Const"() {value = dense<[1, 2]> : tensor<2xi32>} : () -> tensor<2xi32> + %3 = "tf.Mean"(%1, %2) : (tensor<1x112x112x64xf32>, tensor<2xi32>) -> tensor<1x64xf32> + + return %3 : tensor<1x64xf32> +} + +// CHECK-LABEL: func @fold_into_fused_batch_norm +func @fold_into_fused_batch_norm(%arg0: tensor<1x64x112x112xf32>, %arg1: tensor<64xf32>) -> tensor<1x112x112x64xf32> { + + // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} + // CHECK: "tf.FusedBatchNormV3"(%arg0, {{.*}} {data_format = "NCHW" + // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%y, %[[RES_PERM]]) + // CHECK: return %[[RES_TRANSPOSE]] + + // Transpose NCHW -> NHWC + %0 = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> + %1 = "tf.Transpose"(%arg0, %0) : (tensor<1x64x112x112xf32>, tensor<4xi32>) -> tensor<1x112x112x64xf32> + + // Compute FusedBatchNormV3 in NHWC format + %2, %batch_mean, %batch_var, %reserve_1, %reserve_2, %reserve_3 + = "tf.FusedBatchNormV3"(%1, %arg1, %arg1, %arg1, %arg1) + { + data_format = "NHWC", + epsilon = 1.001 : f32, + exponential_avg_factor = 1.0 : f32, + is_training = false + } + : (tensor<1x112x112x64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) + -> (tensor<1x112x112x64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) + + return %2#0 : tensor<1x112x112x64xf32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_to_nchw.mlir similarity index 79% rename from tensorflow/compiler/mlir/tensorflow/tests/layout_optimization.mlir rename to tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_to_nchw.mlir index 44330d675e2..a2394cd93c1 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_to_nchw.mlir @@ -4,15 +4,15 @@ func @transposeBiasAdd(%arg0: tensor<1x8x4x4xf32>, %arg1: tensor<8xf32>) -> tensor<1x8x4x4xf32> { // Convert input: NCHW -> NHWC - %0 = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} : () -> tensor<4xi64> - %1 = "tf.Transpose"(%arg0, %0) : (tensor<1x8x4x4xf32>, tensor<4xi64>) -> tensor<1x4x4x8xf32> + %0 = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> + %1 = "tf.Transpose"(%arg0, %0) : (tensor<1x8x4x4xf32>, tensor<4xi32>) -> tensor<1x4x4x8xf32> // Compute in NHWC %2 = "tf.BiasAdd"(%1, %arg1) {data_format = "NHWC"} : (tensor<1x4x4x8xf32>, tensor<8xf32>) -> tensor<1x4x4x8xf32> // Convert result back: NHWC -> NCHW - %3 = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} : () -> tensor<4xi64> - %4 = "tf.Transpose"(%2, %3) : (tensor<1x4x4x8xf32>, tensor<4xi64>) -> tensor<1x8x4x4xf32> + %3 = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> + %4 = "tf.Transpose"(%2, %3) : (tensor<1x4x4x8xf32>, tensor<4xi32>) -> tensor<1x8x4x4xf32> // Check that BiasAdd computed in NCHW format, and all redundant transpose // operations removed from the function. 
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_to_nhwc.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_to_nhwc.mlir
new file mode 100644
index 00000000000..85b4e3671ac
--- /dev/null
+++ b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_to_nhwc.mlir
@@ -0,0 +1,156 @@
+// RUN: tf-opt %s -tf-layout-optimization=force-data-format=NHWC -verify-diagnostics | FileCheck %s --dump-input=always
+
+// CHECK-LABEL: func @transpose_resnet_layer
+func @transpose_resnet_layer(%arg0: tensor,  // input
+                             %arg1: tensor<64xf32>,   // batch_norm args
+                             %arg2: tensor<256xf32>,  // batch_norm args
+                             %arg3: tensor<7x7x3x64xf32>,   // conv filter #0
+                             %arg4: tensor<1x1x64x256xf32>  // conv filter #1
+                            ) -> tensor {
+
+  // This is a simplified ResNet layer that gets input in NHWC format, converts
+  // it to NCHW before padding, and does all computations in NCHW (this is the
+  // default setup for a ResNet model trained in fp32 on GPUs).
+  //
+  // To be able to use Tensor Cores on the latest NVIDIA GPUs, this model has
+  // to be converted to the NHWC data format.
+
+  // Padding in spatial dimension (NCHW)
+  %0 = "tf.Const"() {value = dense<[[0, 0], [0, 0], [3, 3], [3, 3]]> : tensor<4x2xi32>} : () -> tensor<4x2xi32>
+
+  // Reduce over spatial dimensions (NCHW)
+  %1 = "tf.Const"() {value = dense<[2, 3]> : tensor<2xi32>} : () -> tensor<2xi32>
+
+  // Transpose input: NHWC -> NCHW
+  %2 = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32>
+  %3 = "tf.Transpose"(%arg0, %2) : (tensor, tensor<4xi32>) -> tensor
+
+  // Pad spatial dimensions.
+  %4 = "tf.Pad"(%3, %0) : (tensor, tensor<4x2xi32>) -> tensor
+
+  // Shuffled paddings.
+  // CHECK: %[[PADDINGS:[0-9]*]] = "tf.Const"(){{.*}}[0, 0], [3, 3], [3, 3], [0, 0]
+
+  // Pad input with new paddings.
+  // CHECK: %[[PAD:[0-9]*]] = "tf.Pad"(%arg0, %[[PADDINGS]])
+  // CHECK-SAME: (tensor, tensor<4x2xi32>) -> tensor
+
+  // ------------------------------------------------------------------------ //
+  // Convolution layer #0.
+  // ------------------------------------------------------------------------ //
+  %5 = "tf.Conv2D"(%4, %arg3)
+    {
+      data_format = "NCHW",
+      dilations = [1, 1, 1, 1],
+      explicit_paddings = [],
+      padding = "VALID",
+      strides = [1, 1, 2, 2]
+    } : (tensor, tensor<7x7x3x64xf32>) -> tensor
+
+  // CHECK: %[[CONV0:[0-9]*]] = "tf.Conv2D"
+  // CHECK-SAME: %[[PAD]]
+  // CHECK-SAME: data_format = "NHWC"
+  // CHECK-SAME: strides = [1, 2, 2, 1]
+
+  %6, %batch_mean, %batch_variance, %reserved_1, %reserved_2, %reserved_3 =
+    "tf.FusedBatchNormV3"(%5, %arg1, %arg1, %arg1, %arg1)
+    {
+      data_format = "NCHW",
+      epsilon = 1.001000e-05 : f32,
+      is_training = false
+    } : (tensor, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>)
+      -> (tensor, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<*xf32>)
+
+  // CHECK: "tf.FusedBatchNormV3"
+  // CHECK-SAME: data_format = "NHWC"
+
+  %7 = "tf.Relu"(%6) : (tensor) -> tensor
+  %8 = "tf.MaxPool"(%7)
+    {
+      data_format = "NCHW",
+      ksize = [1, 1, 3, 3],
+      padding = "SAME",
+      strides = [1, 1, 2, 2]
+    } : (tensor) -> tensor
+
+  // CHECK: %[[MAX_POOL:[0-9]*]] = "tf.MaxPool"
+  // CHECK-SAME: data_format = "NHWC"
+  // CHECK-SAME: ksize = [1, 3, 3, 1]
+  // CHECK-SAME: strides = [1, 2, 2, 1]
+
+  // ------------------------------------------------------------------------ //
+  // Convolution layer #1.
+ // ------------------------------------------------------------------------ // + %9 = "tf.Conv2D"(%8, %arg4) + { + data_format = "NCHW", + dilations = [1, 1, 1, 1], + explicit_paddings = [], + padding = "VALID", + strides = [1, 1, 1, 1] + } : (tensor, tensor<1x1x64x256xf32>) -> tensor + + // CHECK: %[[CONV1:[0-9]*]] = "tf.Conv2D"(%[[MAX_POOL]], %arg4) + // CHECK-SAME: data_format = "NHWC" + + %10, %batch_mean_1, %batch_variance_1, %reserved_1_1, %reserved_1_2, %reserved_1_3 = + "tf.FusedBatchNormV3"(%9, %arg2, %arg2, %arg2, %arg2) + { + data_format = "NCHW", + epsilon = 1.001000e-05 : f32 + } : (tensor, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>) + -> (tensor, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<*xf32>) + + // CHECK: %[[BATCH_NORM1:[_a-z0-9]*]], {{.*}} = "tf.FusedBatchNormV3" + // CHECK-SAME: %[[CONV1]] + // CHECK-SAME: data_format = "NHWC" + + // ------------------------------------------------------------------------ // + // Convolution layer #2. + // ------------------------------------------------------------------------ // + %11 = "tf.Conv2D"(%8, %arg4) + { + data_format = "NCHW", + dilations = [1, 1, 1, 1], + explicit_paddings = [], + padding = "VALID", + strides = [1, 1, 1, 1] + } : (tensor, tensor<1x1x64x256xf32>) -> tensor + + // CHECK: %[[CONV2:[0-9]*]] = "tf.Conv2D"(%[[MAX_POOL]], %arg4) + // CHECK-SAME: data_format = "NHWC" + + %12, %batch_mean_2, %batch_variance_2, %reserved_2_1, %reserved_2_2, %reserved_2_3 = + "tf.FusedBatchNormV3"(%11, %arg2, %arg2, %arg2, %arg2) + { + data_format = "NCHW", + epsilon = 1.001000e-05 : f32 + } : (tensor, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>) + -> (tensor, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<*xf32>) + + // CHECK: %[[BATCH_NORM2:[_a-z0-9]*]], {{.*}} = "tf.FusedBatchNormV3" + // CHECK-SAME: %[[CONV2]] + // CHECK-SAME: data_format = "NHWC" + + // ------------------------------------------------------------------------ // + // Add results of convolution layers #1 and #2. + // ------------------------------------------------------------------------ // + + %14 = "tf.AddV2"(%10, %12) : (tensor, tensor) -> tensor + %15 = "tf.Relu"(%14) : (tensor) -> tensor + + // CHECK: %[[ADD:[0-9]*]] = "tf.AddV2"(%[[BATCH_NORM1]], %[[BATCH_NORM2]]) + // CHECK: %[[RELU:[0-9]*]] = "tf.Relu"(%[[ADD]]) + + // Reduce spatial dimensions + %16 = "tf.Mean"(%15, %1) : (tensor, tensor<2xi32>) -> tensor + + // Mean should compute reduction over NHWC spatial dimensions. 
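+  // With the data format forced to NHWC, the NCHW reduction indices [2, 3]
+  // held by %1 at the top of this function should be rewritten to the NHWC
+  // spatial dimensions [1, 2], as the CHECK lines below verify.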
+  // CHECK: %[[MEAN_DIMS:[0-9]*]] = "tf.Const"() {value = dense<[1, 2]> : tensor<2xi32>}
+  // CHECK: %[[MEAN:[0-9]*]] = "tf.Mean"(%[[RELU]], %[[MEAN_DIMS]])
+  // CHECK-SAME: (tensor, tensor<2xi32>) -> tensor
+  // CHECK: return %[[MEAN]] : tensor
+
+  return %16 : tensor
+}
+
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/parallel_execute_to_islands.mlir b/tensorflow/compiler/mlir/tensorflow/tests/parallel_execute_to_islands.mlir
new file mode 100644
index 00000000000..be23da672e5
--- /dev/null
+++ b/tensorflow/compiler/mlir/tensorflow/tests/parallel_execute_to_islands.mlir
@@ -0,0 +1,194 @@
+// RUN: tf-opt %s -tf-parallel-execute-to-islands | FileCheck %s --dump-input=fail
+
+// CHECK-LABEL: func @check_regions_to_islands
+func @check_regions_to_islands() {
+  tf_executor.graph {
+    tf_executor.island() {
+      "tf_device.parallel_execute"() ({
+        tf_device.return
+      },
+      {
+        tf_device.return
+      }) {} : () -> ()
+      tf_executor.yield
+    }
+    tf_executor.fetch
+  }
+  return
+}
+
+// CHECK: %[[ISLAND_INPUT_CTL:[a-z_0-9]*]] = tf_executor.island {
+// CHECK-NEXT: tf_executor.yield
+// CHECK: %[[ISLAND_1_CTL:[a-z_0-9]*]] = tf_executor.island(%[[ISLAND_INPUT_CTL]]) {
+// CHECK: tf_executor.yield
+// CHECK: %[[ISLAND_2_CTL:[a-z_0-9]*]] = tf_executor.island(%[[ISLAND_INPUT_CTL]]) {
+// CHECK: tf_executor.yield
+// CHECK: %{{.*}} = tf_executor.island(%[[ISLAND_1_CTL]], %[[ISLAND_2_CTL]]) {
+// CHECK-NEXT: tf_executor.yield
+
+
+// CHECK-LABEL: func @check_regions_to_islands_with_inputs
+// CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor<i1>)
+func @check_regions_to_islands_with_inputs(%arg0 : tensor<i1>) {
+  tf_executor.graph {
+    %1:2 = tf_executor.island {
+      %2 = "tf.opA"(%arg0) : (tensor<i1>) -> tensor<i1>
+      tf_executor.yield %2 : tensor<i1>
+    }
+    tf_executor.island() {
+      "tf_device.parallel_execute"() ({
+        %3 = "tf.opB"(%1#0) : (tensor<i1>) -> tensor<i1>
+        tf_device.return %3 : tensor<i1>
+      },
+      {
+        %5 = "tf.opC"(%1#0) : (tensor<i1>) -> tensor<i1>
+        tf_device.return %5 : tensor<i1>
+      }) {} : () -> (tensor<i1>, tensor<i1>)
+      tf_executor.yield
+    }
+    tf_executor.fetch
+  }
+  return
+}
+
+// CHECK: %[[INPUT_A:[a-z_0-9]*]], %{{.*}} = tf_executor.island {
+// CHECK-NEXT: %[[OP_A_OUTPUT:[a-z_0-9]*]] = "tf.opA"(%[[ARG_0]]) : (tensor<i1>) -> tensor<i1>
+// CHECK-NEXT: tf_executor.yield %[[OP_A_OUTPUT]] : tensor<i1>
+// CHECK: %[[INPUT_0:[a-z_0-9]*]], %[[INPUT_CONTROL:[a-z_0-9]*]] = tf_executor.island {
+// CHECK-NEXT: tf_executor.yield %[[INPUT_A]] : tensor<i1>
+// CHECK: %[[ISLAND_1_OUTPUT:[a-z_0-9]*]], %[[ISLAND_1_CTL:[a-z_0-9]*]] = tf_executor.island {
+// CHECK-NEXT: %[[OP_B_OUTPUT:[a-z_0-9]*]] = "tf.opB"(%[[INPUT_0]]) : (tensor<i1>) -> tensor<i1>
+// CHECK: tf_executor.yield %[[OP_B_OUTPUT]] : tensor<i1>
+// CHECK: %[[ISLAND_2_OUTPUT:[a-z_0-9]*]], %[[ISLAND_2_CTL:[a-z_0-9]*]] = tf_executor.island {
+// CHECK-NEXT: %[[OP_C_OUTPUT:[a-z_0-9]*]] = "tf.opC"(%outputs_0) : (tensor<i1>) -> tensor<i1>
+// CHECK: tf_executor.yield %[[OP_C_OUTPUT]] : tensor<i1>
+// CHECK: %{{.*}} = tf_executor.island(%[[ISLAND_1_CTL]], %[[ISLAND_2_CTL]]) {
+// CHECK-NEXT: tf_executor.yield
+
+
+// CHECK-LABEL: func @check_input_sink_island_forwards_control_inputs
+// CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor<i1>)
+func @check_input_sink_island_forwards_control_inputs(%arg0 : tensor<i1>) {
+  tf_executor.graph {
+    %1:2 = tf_executor.island {
+      %2 = "tf.opA"(%arg0) : (tensor<i1>) -> tensor<i1>
+      tf_executor.yield %2 : tensor<i1>
+    }
+    %7 = tf_executor.ControlTrigger {}
+    %8 = tf_executor.ControlTrigger {}
+    tf_executor.island(%7, %8) {
+      "tf_device.parallel_execute"() ({
+        %3 = "tf.opB"(%1#0) : (tensor<i1>) -> tensor<i1>
+        tf_device.return %3 : tensor<i1>
+      },
+      {
+        %5 = "tf.opC"() : () -> tensor<i1>
+        tf_device.return %5 : tensor<i1>
+      }) {} : () -> (tensor<i1>, tensor<i1>)
+      tf_executor.yield
+    }
+    tf_executor.fetch
+  }
+  return
+}
+
+// CHECK: %[[INPUT_A:[a-z_0-9]*]], %{{.*}} = tf_executor.island {
+// CHECK-NEXT: %[[OP_A_OUTPUT:[a-z_0-9]*]] = "tf.opA"(%[[ARG_0]]) : (tensor<i1>) -> tensor<i1>
+// CHECK-NEXT: tf_executor.yield %[[OP_A_OUTPUT]] : tensor<i1>
+// CHECK: %[[CT_0:[0-9]*]] = tf_executor.ControlTrigger
+// CHECK: %[[CT_1:[0-9]*]] = tf_executor.ControlTrigger
+// CHECK: %[[INPUT_0:[a-z_0-9]*]], %[[INPUT_CONTROL:[a-z_0-9]*]] = tf_executor.island(%[[CT_0]], %[[CT_1]]) {
+// CHECK-NEXT: tf_executor.yield %[[INPUT_A]] : tensor<i1>
+// CHECK: %[[ISLAND_1_OUTPUT:[a-z_0-9]*]], %[[ISLAND_1_CTL:[a-z_0-9]*]] = tf_executor.island {
+// CHECK-NEXT: %[[OP_B_OUTPUT:[a-z_0-9]*]] = "tf.opB"(%[[INPUT_0]]) : (tensor<i1>) -> tensor<i1>
+// CHECK: tf_executor.yield %[[OP_B_OUTPUT]] : tensor<i1>
+// CHECK: %[[ISLAND_2_OUTPUT:[a-z_0-9]*]], %[[ISLAND_2_CTL:[a-z_0-9]*]] = tf_executor.island(%[[INPUT_CONTROL]]) {
+// CHECK-NEXT: %[[OP_C_OUTPUT:[a-z_0-9]*]] = "tf.opC"() : () -> tensor<i1>
+// CHECK: tf_executor.yield %[[OP_C_OUTPUT]] : tensor<i1>
+// CHECK: %{{.*}} = tf_executor.island(%[[ISLAND_1_CTL]], %[[ISLAND_2_CTL]]) {
+// CHECK-NEXT: tf_executor.yield
+
+
+// CHECK-LABEL: func @check_control_dep_added_when_region_does_not_have_inputs
+// CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor<i1>)
+func @check_control_dep_added_when_region_does_not_have_inputs(%arg0 : tensor<i1>) {
+  tf_executor.graph {
+    %1:2 = tf_executor.island {
+      %2 = "tf.opA"(%arg0) : (tensor<i1>) -> tensor<i1>
+      tf_executor.yield %2 : tensor<i1>
+    }
+    %7:3 = tf_executor.island() {
+      %8:2 = "tf_device.parallel_execute"() (
+      {
+        %3 = "tf.opB"() : () -> tensor<i1>
+        tf_device.return %3 : tensor<i1>
+      },
+      {
+        %5 = "tf.opC"(%1#0) : (tensor<i1>) -> tensor<i1>
+        tf_device.return %5 : tensor<i1>
+      }
+      ) {} : () -> (tensor<i1>, tensor<i1>)
+
+      tf_executor.yield %8#0, %8#1 : tensor<i1>, tensor<i1>
+    }
+
+    tf_executor.island {
+      "tf.opD"(%7#0, %7#1) : (tensor<i1>, tensor<i1>) -> ()
+      tf_executor.yield
+    }
+    tf_executor.fetch
+  }
+  return
+}
+
+// CHECK: %[[INPUT_A:[a-z_0-9]*]], %{{.*}} = tf_executor.island {
+// CHECK-NEXT: %[[OP_A_OUTPUT:[a-z_0-9]*]] = "tf.opA"(%[[ARG_0]]) : (tensor<i1>) -> tensor<i1>
+// CHECK-NEXT: tf_executor.yield %[[OP_A_OUTPUT]] : tensor<i1>
+// CHECK: %[[INPUT_0:[a-z_0-9]*]], %[[INPUT_CTL:[a-z_0-9]*]] = tf_executor.island {
+// CHECK-NEXT: tf_executor.yield %[[INPUT_A]] : tensor<i1>
+// CHECK: %[[ISLAND_1_OUTPUT:[a-z_0-9]*]], %{{.*}} = tf_executor.island(%[[INPUT_CTL]]) {
+// CHECK-NEXT: %[[OP_B_OUTPUT:[a-z_0-9]*]] = "tf.opB"() : () -> tensor<i1>
+// CHECK: tf_executor.yield %[[OP_B_OUTPUT]] : tensor<i1>
+// CHECK: %[[ISLAND_2_OUTPUT:[a-z_0-9]*]], %{{.*}} = tf_executor.island {
+// CHECK-NEXT: %[[OP_C_OUTPUT:[a-z_0-9]*]] = "tf.opC"(%outputs_0) : (tensor<i1>) -> tensor<i1>
+// CHECK: tf_executor.yield %[[OP_C_OUTPUT]] : tensor<i1>
+// CHECK: %{{.*}} = tf_executor.island {
+// CHECK-NEXT: tf_executor.yield %[[ISLAND_1_OUTPUT]], %[[ISLAND_2_OUTPUT]]
+
+
+// CHECK-LABEL: func @check_output_barrier_correctly_forwards_outputs
+func @check_output_barrier_correctly_forwards_outputs(%arg0 : tensor<i1>) -> tensor<i1> {
+  %0 = tf_executor.graph {
+    %1:2 = tf_executor.island {
+      %2 = "tf.opA"(%arg0) : (tensor<i1>) -> tensor<i1>
+      tf_executor.yield %2 : tensor<i1>
+    }
+    %8:3 = tf_executor.island() {
+      %7:2 = "tf_device.parallel_execute"() ({
+        %3 = "tf.opB"() : () -> tensor<i1>
+        tf_device.return %3 : tensor<i1>
+      },
+      {
+        %5 = "tf.opC"(%1#0) : (tensor<i1>) -> tensor<i1>
+        tf_device.return %5 : tensor<i1>
+      }) {} : () -> (tensor<i1>, tensor<i1>)
+      tf_executor.yield %7#0, %7#1 : tensor<i1>, tensor<i1>
+    }
+    tf_executor.fetch %8#0 : tensor<i1>
+  }
+  return %0 : tensor<i1>
+}
+
+// CHECK: %[[INPUT_A:[a-z_0-9]*]], %{{.*}} = tf_executor.island {
+// CHECK-NEXT: %[[OP_A_OUTPUT:[a-z_0-9]*]] = "tf.opA"(%[[ARG_0]]) : (tensor<i1>) -> tensor<i1>
+// CHECK-NEXT: tf_executor.yield %[[OP_A_OUTPUT]] : tensor<i1>
+// CHECK: %[[INPUT_0:[a-z_0-9]*]], %[[INPUT_CTL:[a-z_0-9]*]] = tf_executor.island {
+// CHECK-NEXT: tf_executor.yield %[[INPUT_A]] : tensor<i1>
+// CHECK: %[[ISLAND_1_OUTPUT:[a-z_0-9]*]], %{{.*}} = tf_executor.island(%[[INPUT_CTL]]) {
+// CHECK-NEXT: %[[OP_B_OUTPUT:[a-z_0-9]*]] = "tf.opB"() : () -> tensor<i1>
+// CHECK: tf_executor.yield %[[OP_B_OUTPUT]] : tensor<i1>
+// CHECK: %[[ISLAND_2_OUTPUT:[a-z_0-9]*]], %{{.*}} = tf_executor.island {
+// CHECK-NEXT: %[[OP_C_OUTPUT:[a-z_0-9]*]] = "tf.opC"(%[[INPUT_0]]) : (tensor<i1>) -> tensor<i1>
+// CHECK: tf_executor.yield %[[OP_C_OUTPUT]] : tensor<i1>
+// CHECK: %[[OUTPUT_SINK_OUTPUT:[a-z_0-9]*]]:2, %[[OUTPUT_SINK_CTL:[a-z_0-9]*]] = tf_executor.island {
+// CHECK-NEXT: tf_executor.yield %[[ISLAND_1_OUTPUT]], %[[ISLAND_2_OUTPUT]] : tensor<i1>, tensor<i1>
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir
index 9b29c5c1d92..319660ae4bb 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir
@@ -854,6 +854,78 @@ func @testInvalidIfOp(tensor<i1>, tensor<*xf32>) -> tensor<2xf32> {
 
 // -----
 
+// Test valid tf.MatrixBandPart
+// CHECK-LABEL: func @testValidMatrixBandPartOp
+func @testValidMatrixBandPartOp(%arg0: tensor<64x64xbf16>, %arg1: tensor<i64>, %arg2: tensor<i64>) -> tensor<64x64xbf16> {
+  %0 = "tf.MatrixBandPart"(%arg0, %arg1, %arg2) : (tensor<64x64xbf16>, tensor<i64>, tensor<i64>) -> tensor<64x64xbf16>
+  return %0 : tensor<64x64xbf16>
+}
+
+// -----
+
+// Test valid tf.MatrixBandPart
+// CHECK-LABEL: func @testValidMatrixBandPartOp3D
+func @testValidMatrixBandPartOp3D(%arg0: tensor<64x64x64xbf16>, %arg1: tensor<i64>, %arg2: tensor<i64>) -> tensor<64x64x64xbf16> {
+  %0 = "tf.MatrixBandPart"(%arg0, %arg1, %arg2) : (tensor<64x64x64xbf16>, tensor<i64>, tensor<i64>) -> tensor<64x64x64xbf16>
+  return %0 : tensor<64x64x64xbf16>
+}
+
+// -----
+
+// Test valid tf.MatrixBandPart
+// CHECK-LABEL: func @testValidMatrixBandPartOpUnranked
+func @testValidMatrixBandPartOpUnranked(%arg0: tensor<*xbf16>, %arg1: tensor<i64>, %arg2: tensor<i64>) -> tensor<*xbf16> {
+  %0 = "tf.MatrixBandPart"(%arg0, %arg1, %arg2) : (tensor<*xbf16>, tensor<i64>, tensor<i64>) -> tensor<*xbf16>
+  return %0 : tensor<*xbf16>
+}
+
+// -----
+
+// Test invalid tf.MatrixBandPart
+func @testInvalidMatrixBandPartOp(%arg0: tensor<64x64x64xbf16>, %arg1: tensor<i64>, %arg2: tensor<i64>) -> tensor<64x64xbf16> {
+  // expected-error @+1 {{op failed to verify that all of {input, band} have same type}}
+  %0 = "tf.MatrixBandPart"(%arg0, %arg1, %arg2) : (tensor<64x64x64xbf16>, tensor<i64>, tensor<i64>) -> tensor<64x64xbf16>
+  return %0 : tensor<64x64xbf16>
+}
+
+// -----
+
+// Test invalid tf.MatrixBandPart
+func @testInvalidMatrixBandPartOp(%arg0: tensor<64x64x64xbf16>, %arg1: tensor<i64>, %arg2: tensor<i64>) -> tensor<*xbf16> {
+  // expected-error @+1 {{op failed to verify that all of {input, band} have same type}}
+  %0 = "tf.MatrixBandPart"(%arg0, %arg1, %arg2) : (tensor<64x64x64xbf16>, tensor<i64>, tensor<i64>) -> tensor<*xbf16>
+  return %0 : tensor<*xbf16>
+}
+
+// -----
+
+// Test invalid tf.MatrixBandPart
+func @testInvalidMatrixBandPartOp(%arg0: tensor<i64>, %arg1: tensor<64x64xi64>, %arg2: tensor<i64>) -> tensor<i64> {
+  // expected-error @+1 {{op requires `input` to have rank of at least 2, but found 'tensor<i64>'}}
+  %0 = "tf.MatrixBandPart"(%arg0, %arg1, %arg2) : (tensor<i64>, tensor<64x64xi64>, tensor<i64>) -> tensor<i64>
+  return %0 : tensor<i64>
+}
+
+// -----
+
+// Test invalid tf.MatrixBandPart
+func @testInvalidMatrixBandPartOp(%arg0: tensor<64x64xi64>, %arg1: tensor<32xi64>, %arg2: tensor<i64>) -> tensor<64x64xi64> {
+  // expected-error @+1 {{op requires `num_lower` to have 0 dimensions, but found 'tensor<32xi64>'}}
+  %0 = "tf.MatrixBandPart"(%arg0, %arg1, %arg2) : (tensor<64x64xi64>, tensor<32xi64>, tensor<i64>) -> tensor<64x64xi64>
+  return %0 : tensor<64x64xi64>
+}
+
+// -----
+
+// Test invalid tf.MatrixBandPart
+func @testInvalidMatrixBandPartOp(%arg0: tensor<64x64xi64>, %arg1: tensor<i64>, %arg2: tensor<32xi64>) -> tensor<64x64xi64> {
+  // expected-error @+1 {{op requires `num_upper` to have 0 dimensions, but found 'tensor<32xi64>'}}
+  %0 = "tf.MatrixBandPart"(%arg0, %arg1, %arg2) : (tensor<64x64xi64>, tensor<i64>, tensor<32xi64>) -> tensor<64x64xi64>
+  return %0 : tensor<64x64xi64>
+}
+
+// -----
+
 //===--------------------------------------------------------------------===//
 // tf.{|Stateful}PartitionedCall
 //===--------------------------------------------------------------------===//
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/basic.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/basic.py
index 4248099637c..78c18a17d4a 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/basic.py
+++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/basic.py
@@ -47,7 +47,7 @@ class TestModule(tf.Module):
   # CHECK:      func {{@[a-zA-Z_0-9]+}}(
   # CHECK-SAME:   %arg0: tensor<f32> {tf_saved_model.index_path = [0]},
   # CHECK-SAME:   %arg1: tensor<!tf.resource<tensor<f32>>> {tf_saved_model.bound_input = @[[VAR]]},
-  # CHECK-SAME:   %arg2: tensor<f32> {tf_saved_model.bound_input = @[[CONST]]}) -> (
+  # CHECK-SAME:   %arg2: tensor<!tf.resource<tensor<f32>>> {tf_saved_model.bound_input = @[[CONST]]}) -> (
   # CHECK-SAME:   tensor<f32> {tf_saved_model.index_path = []})
   # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["some_function"]
   @tf.function(input_signature=[tf.TensorSpec([], tf.float32)])
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_inline_global_tensors.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_inline_global_tensors.mlir
deleted file mode 100644
index 365a5a3f402..00000000000
--- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_inline_global_tensors.mlir
+++ /dev/null
@@ -1,53 +0,0 @@
-// RUN: tf-opt -tf-saved-model-inline-global-tensors -split-input-file %s | FileCheck %s --dump-input=fail
-
-module attributes {tf_saved_model.semantics} {
-
-  // Test case: Simple case of inlining.
-
-  // CHECK-NOT: tf_saved_model.global_tensor
-  "tf_saved_model.global_tensor"() { sym_name = "c", type = tensor<f32>, value = dense<1.0> : tensor<f32> } : () -> ()
-
-  // CHECK: func @f()
-  func @f(%arg0: tensor<f32> {tf_saved_model.bound_input = @c})
-  attributes {tf_saved_model.exported_names = ["f"]} {
-    // CHECK: "tf.Const"() {value = dense<1.000000e+00> : tensor<f32>}
-    return
-  }
-
-}
-
-// -----
-
-module attributes {tf_saved_model.semantics} {
-
-  // Test case: Do not inline mutable global tensors.
-
-  // CHECK: tf_saved_model.global_tensor
-  "tf_saved_model.global_tensor"() { is_mutable, sym_name = "v", type = tensor<f32>, value = dense<1.0> : tensor<f32> } : () -> ()
-
-  // CHECK: func @f(%arg0: tensor<!tf.resource<tensor<f32>>> {tf_saved_model.bound_input = @v})
-  func @f(%arg0: tensor<!tf.resource<tensor<f32>>> {tf_saved_model.bound_input = @v})
-  attributes {tf_saved_model.exported_names = ["f"]} {
-    // CHECK-NOT: tf.Const
-    return
-  }
-
-}
-
-// -----
-
-module attributes {tf_saved_model.semantics} {
-
-  // Test case: Sanity check handling of non-bound inputs.
-  // The pass shouldn't do anything in this case.
-
-  // CHECK: func @f(%arg0: tensor<f32> {tf_saved_model.index_path = [0]})
-  func @f(%arg0: tensor<f32> {tf_saved_model.index_path = [0]})
-  attributes {tf_saved_model.exported_names = ["f"]} {
-    // CHECK-NOT: tf.Const
-    return
-  }
-
-}
-
-// TODO: have an arg that isn't a bound input.
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops.mlir
index 1bf172b2655..df8e14b8d9e 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops.mlir
@@ -25,7 +25,7 @@ module attributes {tf_saved_model.semantics} {
   // CHECK: func @__concrete_function_run_computation
   func @__concrete_function_run_computation(
     %arg0: tensor<f32> {tf_saved_model.index_path = [0, "foo"]},
-    %arg1: tensor<1x64xf32> {tf_saved_model.bound_input = @some_constant},
+    %arg1: tensor<!tf.resource<tensor<1x64xf32>>> {tf_saved_model.bound_input = @some_constant},
     %arg2: tensor<!tf.resource<tensor<?x64xf32>>> {tf_saved_model.bound_input = @some_variable}
   ) -> (
     tensor<f32> {tf_saved_model.index_path = [0, "bar"]}
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir
index 6e6c8ae3821..f892b5dfb8b 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir
@@ -244,7 +244,7 @@ module attributes {tf_saved_model.semantics} {
 module attributes {tf_saved_model.semantics} {
 
   "tf_saved_model.global_tensor"() { is_mutable, sym_name = "v", type = tensor<?xf32>, value = dense<1.> : tensor<1xf32> } : () -> ()
-  // expected-error@+1 {{mutable bound input with type 'tensor<f32>' expected to have type 'tensor<!tf.resource<tensor<?xf32>>>'}}
+  // expected-error@+1 {{bound input with type 'tensor<f32>' expected to have type 'tensor<!tf.resource<tensor<?xf32>>>'}}
   func @f(%arg0: tensor<f32> {tf_saved_model.bound_input = @v})
   attributes {tf_saved_model.exported_names = ["f"]} {
     return
@@ -253,18 +253,6 @@ module attributes {tf_saved_model.semantics} {
 
 // -----
 
-module attributes {tf_saved_model.semantics} {
-
-  "tf_saved_model.global_tensor"() { sym_name = "v", type = tensor<1xf32>, value = dense<1.> : tensor<1xf32> } : () -> ()
-  // expected-error@+1 {{bound input for immutable 'tf_saved_model.global_tensor' must match the global tensor's type}}
-  func @f(%arg0: tensor<!tf.resource<tensor<1xf32>>> {tf_saved_model.bound_input = @v})
-  attributes {tf_saved_model.exported_names = ["f"]} {
-    return
-  }
-}
-
-// -----
-
 module attributes {tf_saved_model.semantics} {
 
   // expected-error@+1 {{'type' attribute for immutable 'tf_saved_model.global_tensor' should have a static shape}}
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_optimize_global_tensors.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_optimize_global_tensors.mlir
index f2a4373c777..f985be16ab8 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_optimize_global_tensors.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_optimize_global_tensors.mlir
@@ -6,19 +6,16 @@
 
 module attributes {tf_saved_model.semantics} {
 
-  // Test case: Basic test of freezing.
+  // Test case: Basic test of marking immutable.
 
   // CHECK: "tf_saved_model.global_tensor"() {
   // CHECK-NOT: is_mutable
   // CHECK-SAME: } : () -> ()
   "tf_saved_model.global_tensor"() { is_mutable, sym_name = "v", type = tensor<f32>, value = dense<42.> : tensor<f32> } : () -> ()
 
-  // CHECK: func @f(%arg0: tensor<f32> {tf_saved_model.bound_input = @v})
   func @f(%arg0: tensor<!tf.resource<tensor<f32>>> {tf_saved_model.bound_input = @v}) -> (tensor<f32> {tf_saved_model.index_path = []})
   attributes {tf_saved_model.exported_names = ["f"]} {
-    // CHECK-NOT: tf.ReadVariableOp
     %val = "tf.ReadVariableOp"(%arg0) : (tensor<!tf.resource<tensor<f32>>>) -> tensor<f32>
-    // CHECK: return %arg0
     return %val : tensor<f32>
   }
 
@@ -28,18 +25,16 @@ module attributes {tf_saved_model.semantics} {
 
 module attributes {tf_saved_model.semantics} {
 
-  // Test case: Don't freeze if the variable is mutated.
+  // Test case: Don't mark immutable if the variable is mutated.
 
   // CHECK: "tf_saved_model.global_tensor"() {
   // CHECK-SAME: is_mutable
   // CHECK-SAME: } : () -> ()
   "tf_saved_model.global_tensor"() { is_mutable, sym_name = "v", type = tensor<f32>, value = dense<42.> : tensor<f32> } : () -> ()
 
-  // CHECK: func @f(%arg0: tensor<!tf.resource<tensor<f32>>> {tf_saved_model.bound_input = @v})
   func @f(%arg0: tensor<!tf.resource<tensor<f32>>> {tf_saved_model.bound_input = @v})
   attributes {tf_saved_model.exported_names = ["f"]} {
     %c0 = "tf.Const"() { value = dense<1.0> : tensor<f32> } : () -> tensor<f32>
-    // CHECK: tf.AssignVariableOp
     "tf.AssignVariableOp"(%arg0, %c0) : (tensor<!tf.resource<tensor<f32>>>, tensor<f32>) -> ()
     return
   }
 
@@ -50,14 +45,13 @@ module attributes {tf_saved_model.semantics} {
 
 module attributes {tf_saved_model.semantics} {
 
-  // Test case: Don't freeze if the variable is exported.
+  // Test case: Don't mark immutable if the variable is exported.
 
   // CHECK: "tf_saved_model.global_tensor"() {
   // CHECK: is_mutable
   // CHECK-SAME: } : () -> ()
   "tf_saved_model.global_tensor"() { is_mutable, sym_name = "v", tf_saved_model.exported_names = ["v"], type = tensor<f32>, value = dense<42.> : tensor<f32> } : () -> ()
 
-  // CHECK: func @f(%arg0: tensor<!tf.resource<tensor<f32>>> {tf_saved_model.bound_input = @v})
   func @f(%arg0: tensor<!tf.resource<tensor<f32>>> {tf_saved_model.bound_input = @v}) -> (tensor<f32> {tf_saved_model.index_path = []})
   attributes {tf_saved_model.exported_names = ["f"]} {
     %val = "tf.ReadVariableOp"(%arg0) : (tensor<!tf.resource<tensor<f32>>>) -> tensor<f32>
@@ -71,7 +65,7 @@ module attributes {tf_saved_model.semantics} {
 
 module attributes {tf_saved_model.semantics} {
 
-  // Test case: Check that a non-bound input is not modified.
+  // Test case: Check that a non-bound input is left unchanged.
 
   // CHECK: func @g
   func @g(%arg0: tensor<f32> {tf_saved_model.index_path = [0]}) -> (tensor<f32> {tf_saved_model.index_path = []})
@@ -86,14 +80,16 @@ module attributes {tf_saved_model.semantics} {
 
 module attributes {tf_saved_model.semantics} {
 
-  // Test case: Check that an immutable bound input isn't modified.
+  // Test case: Check that no change is made for a global tensor that is already
+  // immutable.
"tf_saved_model.global_tensor"() { sym_name = "c", type = tensor, value = dense<42.> : tensor } : () -> () - // CHECK: func @h(%arg0: tensor {tf_saved_model.bound_input = @c}) - func @h(%arg0: tensor {tf_saved_model.bound_input = @c}) -> (tensor {tf_saved_model.index_path = []}) + // CHECK: func @h(%arg0: tensor>> {tf_saved_model.bound_input = @c}) + func @h(%arg0: tensor>> {tf_saved_model.bound_input = @c}) attributes {tf_saved_model.exported_names = ["h"]} { - return %arg0 : tensor + %0 = "tf.ReadVariableOp"(%arg0) : (tensor>>) -> tensor + return } } @@ -134,7 +130,7 @@ module attributes {tf_saved_model.semantics} { "tf_saved_model.global_tensor"() { sym_name = "c", type = tensor, value = dense<42.> : tensor } : () -> () // CHECK: func @f() - func @f(%arg0: tensor {tf_saved_model.bound_input = @c}) + func @f(%arg0: tensor>> {tf_saved_model.bound_input = @c}) attributes {tf_saved_model.exported_names = ["f"]} { return } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu-variable-runtime-reformatting.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu-variable-runtime-reformatting.mlir index 2d4c4293ab6..5b657912ca9 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu-variable-runtime-reformatting.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu-variable-runtime-reformatting.mlir @@ -102,18 +102,19 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr func @main(%arg0: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, %arg1: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}, %arg2: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, - %arg3: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}) { + %arg3: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}, + %arg4: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, + %arg5: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}) { %0 = "tf.Const"() {value = dense<100> : tensor} : () -> tensor - %1:5 = "tf.While"(%0, %arg0, %arg1, %arg2, %arg3) - {T = ["tfdtype$DT_INT32", "tfdtype$DT_RESOURCE", - "tfdtype$DT_RESOURCE", "tfdtype$DT_RESOURCE", - "tfdtype$DT_RESOURCE"], body = @while_body_7560, - cond = @while_cond_7550, device = "", is_stateless = false, - output_shapes = ["tfshape$", "tfshape$", "tfshape$", "tfshape$", "tfshape$"]} + %1:7 = "tf.While"(%0, %arg0, %arg1, %arg2, %arg3, %arg4, %arg5) + {body = @while_body_7560, + cond = @while_cond_7550, device = "", is_stateless = false} : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) + tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, + tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) -> (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) + tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, + tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) return } // CHECK: func @while_body_7560 @@ -122,9 +123,12 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr %arg1: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, %arg2: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}, %arg3: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, - %arg4: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}) + %arg4: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}, + %arg5: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, + %arg6: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}) -> (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, - 
tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) { + tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, + tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) { %0 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor %1 = "tf.AddV2"(%arg0, %0) {T = i32, device = ""} : (tensor, tensor) -> tensor %2:2 = "tf._TPUCompileMlir"() { @@ -133,27 +137,33 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", mlir_module = "..."} : () -> (tensor, tensor) "tf.TPUCompileSucceededAssert"(%2#0) : (tensor) -> () - %new_var = "tf._UnknownOp0_"(%arg3) : (tensor<*x!tf.resource>>) -> tensor<*x!tf.resource>> + %id0 = "tf.Identity"(%arg3) : (tensor<*x!tf.resource>>) -> tensor<*x!tf.resource>> + "tf._Unknown_"(%id0) : (tensor<*x!tf.resource>>) -> () + %newvar = "tf._SomeOp"() : () -> tensor<*x!tf.resource>> tf_device.replicate([%arg1, %arg2] as %arg30: tensor<*x!tf.resource>>, - [%new_var, %arg4] as %arg31: tensor<*x!tf.resource>>) + [%arg3, %arg4] as %arg31: tensor<*x!tf.resource>>, + [%newvar, %arg6] as %arg32: tensor<*x!tf.resource>>) {_mirrored_variable_indices = [0, 1], devices = {TPU_REPLICATED_CORE_0 = ["/device:TPU:0", "/device:TPU:1"]}, n = 2 : i32} { - // %arg30 is used in the cond function, and %arg31 is not pass-through of - // while inputs, so neither should be formatted. - "tf.TPUExecuteAndUpdateVariables"(%arg30, %arg31, %2#1) + // %arg30 is used in the cond function, %arg31 has other uses (%id0), and + // %arg32 is not a pass-through. + "tf.TPUExecuteAndUpdateVariables"(%arg30, %arg31, %arg32, %2#1) {device_var_reads_indices = [0, 1], device_var_updates_indices = [0, 1]} - : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor) -> () + : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, + tensor<*x!tf.resource>>, tensor) -> () tf_device.return } - return %1, %arg1, %arg2, %arg3, %arg4 : tensor, tensor<*x!tf.resource>>, + return %1, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6 : tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>> + tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor<*x!tf.resource>> } // CHECK-LABEL: func @while_cond_7550 func @while_cond_7550(%arg0: tensor, %arg1: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, %arg2: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}, %arg3: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, - %arg4: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}) + %arg4: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}, + %arg5: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, + %arg6: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}) -> tensor { %0 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor %1 = "tf.GreaterEqual"(%arg0, %0) {T = i32, device = ""} : (tensor, tensor) -> tensor diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir index d61b8d8d8b8..c4176115926 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir @@ -891,35 +891,3 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor return %0 : tensor } } - -// ----- - -// Tests simple case of launch_func on TPU with replication with 
multiple logical cores. - -module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"]} { - // CHECK-LABEL: func @replicated_tpu_launch_func_with_multiple_logical_cores - func @replicated_tpu_launch_func_with_multiple_logical_cores(%arg0: tensor) -> tensor { - %0 = "tf.A"(%arg0) : (tensor) -> tensor - // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate - // CHECK-SAME: ([%[[A_OUTPUT]], %[[ARG_0]]] as %[[RI_0:[a-z0-9]*]]: tensor) - // CHECK-SAME: n = 2 - %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { - // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf._TPUCompileMlir" - // CHECK: "tf_device.parallel_execute"() - // CHECK-NEXT: %[[LAUNCH:[0-9]*]] = "tf_device.launch"() ( { - // CHECK-NEXT: %[[EXECUTE_OUTPUT:[0-9]*]] = "tf.TPUExecute"(%[[ARG_1]], %[[COMPILE_OUTPUT]]#1) - // CHECK: %[[LAUNCH:[0-9]*]] = "tf_device.launch"() ( { - // CHECK-NEXT: %[[EXECUTE_OUTPUT:[0-9]*]] = "tf.TPUExecute"(%[[ARG_1]], %[[COMPILE_OUTPUT]]#1) - %2 = "tf_device.launch_func"(%ri_0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 2, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"]} : (tensor) -> tensor - tf_device.return %2 : tensor - } - - %2 = "tf.C"(%1#1) : (tensor) -> tensor - return %2 : tensor - } - - func @tpu0_func(%arg0: tensor) -> tensor { - %0 = "tf.B"(%arg0) : (tensor) -> tensor - return %0 : tensor - } -} diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_outlining.cc b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_outlining.cc index f181924d0a6..0fef58ebb8a 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_outlining.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_outlining.cc @@ -17,7 +17,7 @@ limitations under the License. // `tf_device.launch` with equivalent `tf_device.launch_func` operations. #include "llvm/ADT/SmallVector.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Block.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td b/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td index a95a319d0a4..bac7b9ba01c 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ include "mlir/IR/OpBase.td" -include "mlir/Dialect/StandardOps/Ops.td" +include "mlir/Dialect/StandardOps/IR/Ops.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" // Here, the element type can be any integer or float type. 
But, note that only diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_inline_tpu_island.cc b/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_inline_tpu_island.cc index 9660367cb68..ad844883453 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_inline_tpu_island.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_inline_tpu_island.cc @@ -17,7 +17,7 @@ limitations under the License. #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project #include "mlir/IR/Visitors.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_outline_tpu_island.cc b/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_outline_tpu_island.cc index 57ea1822b5b..01901d8b5a4 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_outline_tpu_island.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_outline_tpu_island.cc @@ -16,7 +16,7 @@ limitations under the License. #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project #include "mlir/IR/SymbolTable.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/fold_switch.cc b/tensorflow/compiler/mlir/tensorflow/transforms/fold_switch.cc index 44309a5e019..4d5ad5ad423 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/fold_switch.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/fold_switch.cc @@ -31,7 +31,7 @@ limitations under the License. #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "mlir/Analysis/LoopAnalysis.h" // TF:llvm-project -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Block.h" // TF:llvm-project #include "mlir/IR/MLIRContext.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc index 31898ec1048..8cfa69c396e 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc @@ -16,7 +16,7 @@ limitations under the License. // This transformation pass transforms functional control flow operations in the // standard TensorFlow dialect to MLIR Control Flow Graph (CFG) form. 
-#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project #include "mlir/IR/Operation.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/inline_global_tensors.cc b/tensorflow/compiler/mlir/tensorflow/transforms/inline_global_tensors.cc deleted file mode 100644 index 6d780d08d6b..00000000000 --- a/tensorflow/compiler/mlir/tensorflow/transforms/inline_global_tensors.cc +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// This pass will replace a func's bound inputs which are bound to constant -// global tensors with tf.Const ops inside the func's body. -// This can be useful when bringing up backends since it allows running -// stateless models before implementing global tensor support. - -#include -#include - -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/Sequence.h" -#include "mlir/IR/Attributes.h" // TF:llvm-project -#include "mlir/IR/Builders.h" // TF:llvm-project -#include "mlir/IR/Module.h" // TF:llvm-project -#include "mlir/Pass/Pass.h" // TF:llvm-project -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" - -namespace mlir { -namespace tf_saved_model { -namespace { -struct InlineGlobalTensorsPass : public ModulePass { - void runOnModule() override; -}; - -void InlineGlobalTensorsPass::runOnModule() { - auto module = getModule(); - SymbolTable symbol_table(module); - for (auto func : module.getOps()) { - SmallVector args_to_erase; - OpBuilder builder(func.getBody()); - for (int i = 0, e = func.getNumArguments(); i < e; i++) { - auto global_tensor = LookupBoundInput(func, i, symbol_table); - if (!global_tensor) continue; - - // Don't inline mutable global tensors. They could be holding state across - // invocations to this function. - if (global_tensor.is_mutable()) continue; - - // Replace the arg with a tf.Const op in the function body. - auto const_op = builder.create(global_tensor.getLoc(), - global_tensor.value()); - func.getArgument(i).replaceAllUsesWith(const_op.getResult()); - args_to_erase.push_back(i); - } - func.eraseArguments(args_to_erase); - } - // We have already inlined all constant tensors, so erase them. - for (auto global_tensor : - llvm::make_early_inc_range(module.getOps())) { - if (!global_tensor.is_mutable()) global_tensor.erase(); - } -} - -} // namespace - -// For "opt" to pick up this pass. 
-static PassRegistration<InlineGlobalTensorsPass> pass(
-    "tf-saved-model-inline-global-tensors",
-    "Inline tf_saved_model.global_tensor's as tf.Const ops in func bodies.");
-
-std::unique_ptr<OpPassBase<ModuleOp>> CreateInlineGlobalTensorsPass() {
-  return std::make_unique<InlineGlobalTensorsPass>();
-}
-
-}  // namespace tf_saved_model
-}  // namespace mlir
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc b/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc
index d642b093e6b..e6c4024d5ec 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "mlir/Pass/PassRegistry.h"  // TF:llvm-project
 #include "mlir/Transforms/Passes.h"  // TF:llvm-project
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
+#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h"
 
 #define DEBUG_TYPE "tf-layout-optimization"
 
@@ -30,16 +31,6 @@ namespace TF {
 
 namespace {
 
-// Layout optimization pipeline composes layout assignment and move transposes
-// passes to pick the optimal layout for all layout sensitive operations, and
-// cancel all redundant transposes.
-struct LayoutOptimizationPipelineOptions
-    : public PassPipelineOptions<LayoutOptimizationPipelineOptions> {
-  Option<std::string> force_data_format{
-      *this, "force-data-format",
-      llvm::cl::desc("Force data format for all layout sensitive ops")};
-};
-
 // LayoutAssignmentPass assigns optimal data layout (data format) for all
 // layout sensitive operations.
 class LayoutAssignmentPass : public FunctionPass<LayoutAssignmentPass> {
@@ -83,7 +74,7 @@ class MoveTransposesPass : public FunctionPass<MoveTransposesPass> {
       clEnumValN(Direction::kEnd, "end", "end of the block"))};
 };
 
-using Permutation = SmallVector<int64_t, 4>;
+using Permutation = SmallVector<int32_t, 4>;
 
 Permutation GetDataFormatPermutation(StringRef from_data_format,
                                      StringRef to_data_format) {
@@ -96,22 +87,6 @@ Permutation GetDataFormatPermutation(StringRef from_data_format,
   }
 }
 
-Type PermuteRankedTensorType(Type type, Permutation permutation) {
-  if (auto ranked_type = type.dyn_cast<RankedTensorType>()) {
-    ArrayRef<int64_t> shape = ranked_type.getShape();
-    assert(permutation.size() == shape.size());
-
-    SmallVector<int64_t, 4> new_shape(permutation.size());
-    for (size_t i = 0; i < permutation.size(); ++i) {
-      new_shape[i] = shape[permutation[i]];
-    }
-
-    return RankedTensorType::get(new_shape, ranked_type.getElementType());
-  }
-
-  return type;
-}
-
 void LayoutAssignmentPass::runOnFunction() {
   FuncOp func = getFunction();
 
@@ -139,13 +114,13 @@ void LayoutAssignmentPass::runOnFunction() {
     OpBuilder builder(op->getBlock());
 
    auto perm_attr = [&](Permutation permutation) -> DenseIntElementsAttr {
-      auto perm_ty = RankedTensorType::get({4}, builder.getIntegerType(64));
+      auto perm_ty = RankedTensorType::get({4}, builder.getIntegerType(32));
      return DenseIntElementsAttr::get(perm_ty, permutation);
    };
 
    // Change operation data format.
-    op->setAttr("data_format",
-                StringAttr::get(force_data_format_, op->getContext()));
+    if (failed(layout_sensitive_interface.UpdateDataFormat(force_data_format_)))
+      return;
 
    // Permute arguments into the target data format.
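+    // perm_attr above materializes each permutation as a tensor<4xi32>
+    // constant, so the tf.Transpose ops inserted by this pass take i32
+    // permutation operands (hence the tensor<4xi32> constants in the tests).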
builder.setInsertionPoint(op); @@ -162,8 +137,6 @@ void LayoutAssignmentPass::runOnFunction() { for (int64_t res : layout_sensitive_interface.GetLayoutDependentResults()) { OpResult result = op->getResult(res); - result.setType( - PermuteRankedTensorType(result.getType(), args_permutation)); auto transposed_res = builder.create(loc, result, res_perm); result.replaceAllUsesWith(transposed_res); @@ -426,11 +399,15 @@ void MoveTransposesPass::runOnFunction() { }); } +} // namespace + void CreateLayoutOptimizationPipeline( OpPassManager& pm, // NOLINT - MLIR contract is pass by mutable reference. const LayoutOptimizationPipelineOptions& options) { using Direction = MoveTransposesPass::Direction; + if (options.force_data_format.empty()) return; + // Assign optimal layout for layout sensitive ops. pm.addPass(std::make_unique(options.force_data_format)); @@ -441,8 +418,6 @@ void CreateLayoutOptimizationPipeline( pm.addPass(std::make_unique(Direction::kEnd)); } -} // namespace - static PassRegistration layout_assignment( "tf-layout-assignment", "Layout assignment pass"); static PassRegistration move_transposes( diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.td b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.td index ec0ac5e3c1e..1074f9e1926 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.td @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ include "mlir/IR/OpBase.td" -include "mlir/Dialect/StandardOps/Ops.td" +include "mlir/Dialect/StandardOps/IR/Ops.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" // Here, the element type can be any integer or float type. But, note that only @@ -122,7 +122,7 @@ def LowerSparseSoftmaxCrossEntropyWithLogitsOp : Pattern< //===----------------------------------------------------------------------===// def ComplexTensor : TensorOf<[AnyComplex]>; -def RealTensor : TensorOf<[AnyInteger, AnyFloat]>; +def RealTensor : TensorOf<[AnySignlessInteger, AnyFloat]>; def : Pat<(TF_SquareOp $val), (TF_MulOp $val, $val)>; @@ -179,7 +179,7 @@ def LowerL2LossOp : // Pad op patterns. //===----------------------------------------------------------------------===// -def : Pat<(TF_PadOp TensorOf<[AnyInteger, AnyFloat]>:$input, $paddings), +def : Pat<(TF_PadOp TensorOf<[AnySignlessInteger, AnyFloat]>:$input, $paddings), (TF_PadV2Op $input, $paddings, (TF_ConstOp (GetScalarOfType<0> $input)))>; @@ -224,6 +224,6 @@ def CreateTFShapeOp : NativeCodeCall< // TODO(hinsu): Support inputs of TensorList types. def LowerZerosLikeOp : - Pat<(TF_ZerosLikeOp:$src_op TensorOf<[AnyInteger, AnyFloat]>:$input), + Pat<(TF_ZerosLikeOp:$src_op TensorOf<[AnySignlessInteger, AnyFloat]>:$input), (TF_BroadcastToOp (TF_ConstOp (GetScalarOfType<0> $input)), (CreateTFShapeOp $src_op, $input, /*use 32bit*/ConstBoolAttrFalse))>; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/optimize.cc b/tensorflow/compiler/mlir/tensorflow/transforms/optimize.cc index 36d7712eb2c..acaf7974280 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/optimize.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/optimize.cc @@ -14,7 +14,7 @@ limitations under the License. 
==============================================================================*/ #include -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project #include "mlir/IR/Operation.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/optimize.td b/tensorflow/compiler/mlir/tensorflow/transforms/optimize.td index 87467238e57..0fb62cb064d 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/optimize.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/optimize.td @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ include "mlir/IR/OpBase.td" -include "mlir/Dialect/StandardOps/Ops.td" +include "mlir/Dialect/StandardOps/IR/Ops.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" def IsDataFormatNHWC : ConstantAttr; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc b/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc index 3ad607de38a..2a61b0cf0d1 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc @@ -45,37 +45,27 @@ struct GlobalTensorUse { using GlobalTensorUsesMap = std::map>; -// TODO(silvasean): Are there other read-only variable ops? -// It would be nice if we eventually had an interface that we could use -// to determine if an op is read-only and how to rewrite it. -// For now, IsReadOnlyVariableOp and RewriteReadOnlyVariableOpToTensorOp need to -// be keep in sync. -bool IsReadOnlyVariableOp(Operation* op) { return isa(op); } - -void RewriteReadOnlyVariableOpToTensorOp(Operation* op, Value tensor_value) { - auto read_variable = cast(op); - read_variable.value().replaceAllUsesWith(tensor_value); -} - -bool IsFreezable(GlobalTensorOp global_tensor, +bool IsImmutable(GlobalTensorOp global_tensor, ArrayRef global_tensor_uses) { - // If this tensor is already immutable, don't freeze it. + // Global tensor is already known to be immutable. if (!global_tensor.is_mutable()) { return false; } - // Can't freeze if exported. + // An exported global tensor that is not already known to be immutable might + // be externally mutated. if (IsExported(global_tensor)) { return false; } - // Can't freeze if it is used by anything that we aren't sure is read-only. + // Check the uses to see if this global tensor is only used in a way that + // is compatible with being immutable. // Right now, this uses a very simple algorithm that only checks the top-level // func for tf.ReadVariableOp. If the resource is passed into other functions // or control flow, we fail to prove it is freezable even though we could. for (auto& global_tensor_use : global_tensor_uses) { auto arg = global_tensor_use.func.getArgument(global_tensor_use.arg_index); for (auto user : arg.getUsers()) { - if (!IsReadOnlyVariableOp(user)) { + if (!isa(user)) { return false; } } @@ -103,37 +93,16 @@ static GlobalTensorUsesMap CreateGlobalTensorUsesMap(ModuleOp module) { return global_tensor_uses; } -void FreezeGlobalTensors(ModuleOp module, - const GlobalTensorUsesMap& global_tensor_uses_map) { - // Remove `is_mutable` attribute from tf_saved_model.global_tensor - // and update func arguments to match. 
- //
- // This amounts to changing the type of the argument to a tensor type, and
- // replacing all the tf.ReadVariableOp's with the new tensor argument value.
- OpBuilder builder(module.getBodyRegion());
+// Removes `is_mutable` attribute from tf_saved_model.global_tensor ops where we
+// can prove it is safe to do so.
+void MarkGlobalTensorsImmutable(
+    ModuleOp module, const GlobalTensorUsesMap& global_tensor_uses_map) {
   for (const auto& kv : global_tensor_uses_map) {
     auto global_tensor = kv.first;
     const auto& global_tensor_uses = kv.second;
-    if (!IsFreezable(global_tensor, global_tensor_uses)) {
-      continue;
+    if (IsImmutable(global_tensor, global_tensor_uses)) {
+      global_tensor.removeAttr("is_mutable");
     }
-    for (auto global_tensor_use : global_tensor_uses) {
-      auto func = global_tensor_use.func;
-      auto arg_index = global_tensor_use.arg_index;
-      Value arg = func.getArgument(arg_index);
-      for (Operation* user : llvm::make_early_inc_range(arg.getUsers())) {
-        RewriteReadOnlyVariableOpToTensorOp(user, arg);
-        user->erase();
-      }
-      Type new_type = global_tensor.value().Attribute::getType();
-      arg.setType(new_type);
-      auto old_ftype = func.getType();
-      auto input_types = old_ftype.getInputs().vec();
-      input_types[arg_index] = new_type;
-      func.setType(
-          builder.getFunctionType(input_types, old_ftype.getResults()));
-    }
-    global_tensor.removeAttr("is_mutable");
   }
 }
@@ -178,7 +147,7 @@ void OptimizeGlobalTensorsPass::runOnModule() {
   // Figure out which funcs use each tf_saved_model.global_tensor.
   GlobalTensorUsesMap global_tensor_uses = CreateGlobalTensorUsesMap(module);
-  FreezeGlobalTensors(module, global_tensor_uses);
+  MarkGlobalTensorsImmutable(module, global_tensor_uses);
   EraseUnusedGlobalTensors(module, global_tensor_uses);
 }
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/parallel_execute_to_islands.cc b/tensorflow/compiler/mlir/tensorflow/transforms/parallel_execute_to_islands.cc
new file mode 100644
index 00000000000..5caf08c672e
--- /dev/null
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/parallel_execute_to_islands.cc
@@ -0,0 +1,263 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This pass forms `tf_executor.island` per region of
+// `tf_device.parallel_execute`.
+//
+// For example:
+//  %1:2 = tf_executor.island {
+//    %2 = "tf.opA"(%arg0) : (tensor<i1>) -> tensor<i1>
+//    tf_executor.yield %2 : tensor<i1>
+//  }
+//  tf_executor.island() {
+//    "tf_device.parallel_execute"() ({
+//      %3 = "tf.opB"() : () -> tensor<i1>
+//      tf_device.return %3 : tensor<i1>
+//    },
+//    {
+//      %5 = "tf.opC"(%1#0) : (tensor<i1>) -> tensor<i1>
+//      tf_device.return
+//    }) {} : () -> (tensor<i1>)
+//    tf_executor.yield
+//  }
+//  tf_executor.fetch
+//
+// Would become:
+//  %1:2 = tf_executor.island {
+//    %2 = "tf.opA"(%arg0) : (tensor<i1>) -> tensor<i1>
+//    tf_executor.yield %2 : tensor<i1>
+//  }
+//
+//  // Input barrier sink island that forwards all inputs.
+//  %output_0, %control_1 = tf_executor.island {
+//    tf_executor.yield %1#0: tensor<i1>
+//  }
+//
+//  // Island for the first region of above parallel_execute.
+//  %output_2, %control_3 = tf_executor.island(%control_1) {
+//    %3 = "tf.opB"() : () -> tensor<i1>
+//    tf_executor.yield %3 : tensor<i1>
+//  }
+//
+//  // Island for the second region of above parallel_execute.
+//  %control_5 = tf_executor.island {
+//    %5 = "tf.opC"(%output_0) : (tensor<i1>) -> tensor<i1>
+//    tf_executor.yield
+//  }
+//
+//  // Output barrier sink island that forwards all outputs.
+//  %output_5, %control_6 = tf_executor.island(%control_5) {
+//    tf_executor.yield %output_2
+//  }
+//
+// When tf_device.parallel_execute op is enclosed after tf_device.replicate,
+// then this pass will run following the `replicate-to-island` pass and the
+// `tf-executor-break-up-islands` pass.
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "mlir/IR/Block.h"  // TF:llvm-project
+#include "mlir/IR/Builders.h"  // TF:llvm-project
+#include "mlir/Pass/Pass.h"  // TF:llvm-project
+#include "mlir/Support/LLVM.h"  // TF:llvm-project
+#include "mlir/Support/LogicalResult.h"  // TF:llvm-project
+#include "mlir/Transforms/RegionUtils.h"  // TF:llvm-project
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h"
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h"
+
+namespace mlir {
+namespace TFDevice {
+namespace {
+
+struct ParallelExecuteToIslandsPass
+    : public FunctionPass<ParallelExecuteToIslandsPass> {
+  void runOnFunction() override;
+};
+
+// Convert parallel_execute op to a set of islands where each region of
+// parallel_execute op becomes a separate island. This ensures that the
+// regions of the parallel_execute op get executed concurrently.
+LogicalResult ExpandParallelExecuteToIslands(
+    tf_executor::IslandOp island_op, tf_executor::IslandOp input_sink_island,
+    tf_device::ParallelExecuteOp parallel_execute_op, OpBuilder* builder,
+    llvm::SmallVector<tf_executor::IslandOp, 8>* islands) {
+  const int num_executions =
+      parallel_execute_op.getOperation()->getNumRegions();
+  llvm::SmallVector<tf_executor::IslandOp, 8> executions;
+  executions.reserve(num_executions);
+  builder->setInsertionPoint(island_op);
+
+  auto control_type = tf_executor::ControlType::get(island_op.getContext());
+  for (int i : llvm::seq(0, num_executions)) {
+    auto execute_region =
+        parallel_execute_op.GetRegionBlockWithIndex(i).getParent();
+
+    // If the region does not have any inputs, then add an explicit control
+    // dependency from the input sink island. This guarantees that all inputs
+    // of the parallel_execute op must be materialized before any of the
+    // islands are executed.
+    llvm::SetVector<Value> region_inputs;
+    getUsedValuesDefinedAbove(*execute_region, region_inputs);
+    llvm::SmallVector<Value, 8> execution_control_inputs;
+    if (region_inputs.empty())
+      execution_control_inputs.emplace_back(input_sink_island.control());
+
+    // Collect result types and operands.
+    Operation* terminator = execute_region->front().getTerminator();
+    llvm::SmallVector<Type, 8> output_types(terminator->getOperandTypes());
+
+    // Replace the terminator with a YieldOp, as an island op always ends with
+    // a yield op.
+    builder->setInsertionPoint(terminator);
+    builder->create<tf_executor::YieldOp>(terminator->getLoc(),
+                                          terminator->getOperands());
+    terminator->erase();
+
+    // Create a new island for each region.
+    builder->setInsertionPoint(island_op);
+    auto execution_island = builder->create<tf_executor::IslandOp>(
+        island_op.getLoc(), output_types, control_type,
+        execution_control_inputs);
+
+    // Move the tf_device.parallel_execute body region into the newly
+    // created island.
+    execution_island.body().takeBody(*execute_region);
+    islands->push_back(execution_island);
+  }
+
+  return success();
+}
+
+// Creates an island that works as an input sync point for the islands. This
+// guarantees that all (implicitly captured) inputs of parallel_execute are
+// materialized before any of the islands are executed.
+tf_executor::IslandOp CreateInputBarrierIsland(
+    OpBuilder* builder, tf_executor::IslandOp island_op) {
+  builder->setInsertionPoint(island_op);
+
+  llvm::SetVector<Value> island_inputs;
+  getUsedValuesDefinedAbove(island_op.body(), island_inputs);
+
+  llvm::SmallVector<Type, 8> input_types;
+  input_types.reserve(island_inputs.size());
+  for (const auto& input_val : island_inputs)
+    input_types.emplace_back(input_val.getType());
+
+  // Create a new island that forwards all inputs.
+  auto control_type = tf_executor::ControlType::get(island_op.getContext());
+  auto input_sink_island = builder->create<tf_executor::IslandOp>(
+      island_op.getLoc(), input_types, control_type, island_op.controlInputs());
+  input_sink_island.body().push_back(new Block);
+
+  for (auto input_index_and_value : llvm::enumerate(island_inputs)) {
+    int index = input_index_and_value.index();
+    Value input_value = input_index_and_value.value();
+    replaceAllUsesInRegionWith(input_value, input_sink_island.getResult(index),
+                               island_op.body());
+  }
+
+  // Create a YieldOp for the new input sink island.
+  builder->setInsertionPointToEnd(&input_sink_island.GetBody());
+  builder->create<tf_executor::YieldOp>(island_op.getLoc(),
+                                        llvm::to_vector<8>(island_inputs));
+  return input_sink_island;
+}
+
+// Creates an island that works as an output sync point. This guarantees that
+// execution of all islands must be completed before the op following
+// parallel_execute runs.
+tf_executor::IslandOp CreateOutputBarrierIsland(
+    OpBuilder* builder, tf_executor::IslandOp island_op,
+    llvm::SmallVectorImpl<tf_executor::IslandOp>* islands) {
+  // Add a control dependency to the island operand if the island output has
+  // no uses.
+  llvm::SmallVector<Value, 8> island_operands;
+  for (auto& island : *islands)
+    if (island.use_empty()) island_operands.push_back(island.control());
+
+  // Create a single island forwarding all island results.
+  builder->setInsertionPoint(island_op);
+  auto island_output_sink = builder->create<tf_executor::IslandOp>(
+      island_op.getLoc(), llvm::to_vector<8>(island_op.getResultTypes()),
+      island_operands, llvm::ArrayRef<NamedAttribute>{});
+  island_output_sink.body().push_back(new Block);
+  return island_output_sink;
+}
+
+LogicalResult CreateIslandsFromParallelExecute(
+    tf_executor::IslandOp island_op,
+    tf_device::ParallelExecuteOp parallel_execute_op) {
+  OpBuilder builder(island_op);
+  auto input_sink_island = CreateInputBarrierIsland(&builder, island_op);
+
+  // Create N islands where N is the number of regions inside the
+  // parallel_execute op.
+  llvm::SmallVector<tf_executor::IslandOp, 8> islands;
+  auto result = ExpandParallelExecuteToIslands(
+      island_op, input_sink_island, parallel_execute_op, &builder, &islands);
+  if (failed(result)) return result;
+
+  // Remap all results of the parallel_execute op with outputs from the newly
+  // created islands.
+  llvm::SmallVector<Value, 8> parallel_execute_outputs;
+  parallel_execute_outputs.reserve(
+      parallel_execute_op.getOperation()->getNumResults());
+
+  for (auto island : islands)
+    for (auto output_value : island.outputs())
+      parallel_execute_outputs.emplace_back(output_value);
+
+  parallel_execute_op.getOperation()->replaceAllUsesWith(
+      parallel_execute_outputs);
+
+  auto island_output_sink =
+      CreateOutputBarrierIsland(&builder, island_op, &islands);
+
+  // Move the island YieldOp over to the new single island and remap the
+  // island results.
+  island_op.GetYield().getOperation()->moveBefore(
+      &island_output_sink.GetBody(), island_output_sink.GetBody().begin());
+  island_op.replaceAllUsesWith(island_output_sink);
+  island_op.erase();
+
+  return success();
+}
+
+// Finds islands with a single `tf_device.parallel_execute` and creates
+// individual islands per region of parallel_execute.
+void LowerSingleIslandParallelExecuteToIslands(
+    tf_executor::IslandOp island_op) {
+  if (!has_single_element(island_op.GetBody().without_terminator())) return;
+
+  if (auto parallel_execute_op = llvm::dyn_cast<tf_device::ParallelExecuteOp>(
+          &island_op.GetBody().front()))
+    CreateIslandsFromParallelExecute(island_op, parallel_execute_op);
+}
+
+void ParallelExecuteToIslandsPass::runOnFunction() {
+  getFunction().walk([&](tf_executor::IslandOp island_op) {
+    LowerSingleIslandParallelExecuteToIslands(island_op);
+  });
+}
+}  // anonymous namespace
+
+std::unique_ptr<OpPassBase<FuncOp>> CreateParallelExecuteToIslandsPass() {
+  return std::make_unique<ParallelExecuteToIslandsPass>();
+}
+
+static PassRegistration<ParallelExecuteToIslandsPass> pass(
+    "tf-parallel-execute-to-islands",
+    "Lowers device parallel_execute to executor islands");
+
+}  // namespace TFDevice
+}  // namespace mlir
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h
index ad6fc683b6d..16d8ddfb900 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h
@@ -46,6 +46,19 @@ std::unique_ptr> CreateTFShapeInferencePass();
// Optimizes Tensorflow graph.
std::unique_ptr> CreateTFOptimizePass();
+struct LayoutOptimizationPipelineOptions
+    : public PassPipelineOptions<LayoutOptimizationPipelineOptions> {
+  Option<std::string> force_data_format{
+      *this, "force-data-format",
+      llvm::cl::desc("Force data format for all layout sensitive ops")};
+};
+
+// Layout optimization assigns optimal data layout for layout sensitive
+// operations, and cancels all redundant transposes.
+void CreateLayoutOptimizationPipeline(
+    OpPassManager& pm,  // NOLINT - MLIR contract is pass by mutable reference.
+    const LayoutOptimizationPipelineOptions& options);
+
struct StandardPipelineOptions
    : public PassPipelineOptions {
  Option enable_inliner{*this, "enable-inliner",
@@ -165,6 +178,10 @@ std::unique_ptr> CreateReplicateInvariantOpHoistingPass();
// `tf_device.replicate` island.
std::unique_ptr> CreateReplicateToIslandPass();
+// Creates a pass that creates `tf_executor.island` from a single
+// `tf_device.parallel_execute` island.
+std::unique_ptr<OpPassBase<FuncOp>> CreateParallelExecuteToIslandsPass();
+
// Creates a pass that annotates whether a LaunchFuncOp's parameters have the
// same data across replicas.
std::unique_ptr> CreateAnnotateParameterReplicationPass();
@@ -211,10 +228,6 @@ namespace tf_saved_model {
// Creates a pass that optimizes tf_saved_model.global_tensor ops.
std::unique_ptr> CreateOptimizeGlobalTensorsPass();
-// Creates a pass that inlines global tensors as tf.Const ops in the function
-// body.
-std::unique_ptr> CreateInlineGlobalTensorsPass(); - // Creates a pass that uses tf_saved_model dialect linkage information // to mark function visibility. That is, exported functions are marked with // public visibility while the other functions are marked with private diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc b/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc index 13dda3ed0d0..d3cc508a490 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc @@ -27,7 +27,7 @@ limitations under the License. #include "llvm/ADT/StringRef.h" #include "llvm/Support/FormatVariadic.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Function.h" // TF:llvm-project #include "mlir/IR/StandardTypes.h" // TF:llvm-project #include "mlir/Pass/Pass.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc index 8dc21feca90..dee5e8b079f 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc @@ -22,7 +22,7 @@ limitations under the License. #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Casting.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Block.h" // TF:llvm-project #include "mlir/IR/BlockAndValueMapping.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc index c44f0f97fd6..b3474e2faf1 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc @@ -25,7 +25,7 @@ limitations under the License. #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FormatVariadic.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Block.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project #include "mlir/IR/Diagnostics.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_device_assignment.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tf_device_assignment.cc index a4a8c1ab95f..83451e130ba 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tf_device_assignment.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_device_assignment.cc @@ -34,15 +34,16 @@ class SimpleTFDeviceAssignmentPass void runOnFunction() override { Builder builder(&getContext()); - getFunction().walk([this, &builder](Operation* op) { + Dialect* tf = getContext().getRegisteredDialect(); + getFunction().walk([&](Operation* op) { if (auto device_attr = op->getAttrOfType("device")) { // We assign default device to ops with device attribute that is empty. if (device_attr.getValue() == "") { op->setAttr("device", builder.getStringAttr(default_device_)); } - } else if (llvm::isa(op)) { - // tf.Const may sometimes contain no device attribute. 
In this case, we
-      // assign it the default device.
+    } else if (op->getDialect() == tf) {
+      // Assign default device to all ops in the TensorFlow dialect that do
+      // not have a device attribute.
       op->setAttr("device", builder.getStringAttr(default_device_));
     }
   });
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc
index e51fd10382d..a484251ddcd 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc
@@ -18,6 +18,7 @@ limitations under the License.
#include
+
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
@@ -155,6 +156,7 @@ LogicalResult EncapsulateFuncAndSerialize(FuncOp entry_func,
// TODO(lyandy): Support session handle and guaranteed consts.
LogicalResult SetMetadataProtoFromLaunchFuncOp(
    tf_device::LaunchFuncOp op, int num_replicas, int num_cores_per_replica,
+    llvm::Optional<xla::DeviceAssignmentProto>&& xla_device_assignment,
    tensorflow::tpu::TPUCompileMetadataProto* metadata) {
  metadata->set_num_replicas(num_replicas);
  metadata->set_num_cores_per_replica(num_cores_per_replica);
@@ -197,6 +199,10 @@ LogicalResult SetMetadataProtoFromLaunchFuncOp(
        padding_and_idx.index(), padding_attr_str.getValue()));
  }
+  if (xla_device_assignment.hasValue())
+    *metadata->mutable_device_assignment() =
+        std::move(xla_device_assignment.getValue());
+
  // Set args metadata in proto.
  for (auto operand_type_and_idx : llvm::enumerate(op.getOperandTypes())) {
    Type operand_type = operand_type_and_idx.value();
@@ -251,17 +257,19 @@ LogicalResult SetMetadataProtoFromLaunchFuncOp(
// Create a `tf._TPUCompileMlir` that contains a MLIR module that is
// functionally equivalent to the function referenced by launch_func.
-Operation* BuildCompileOp(tf_device::LaunchFuncOp launch_func, int num_replicas,
-                          int num_cores_per_replica,
-                          llvm::StringRef compilation_device,
-                          OpBuilder* builder) {
+Operation* BuildCompileOp(
+    tf_device::LaunchFuncOp launch_func, int num_replicas,
+    int num_cores_per_replica, llvm::StringRef compilation_device,
+    llvm::Optional<xla::DeviceAssignmentProto>&& xla_device_assignment,
+    OpBuilder* builder) {
  // TODO(b/139377366): Use tf_tpu.compile build method when it is defined.
  OperationState compile_op_state(launch_func.getLoc(), "tf._TPUCompileMlir");
  // Set metadata from attributes.
  tensorflow::tpu::TPUCompileMetadataProto metadata;
  if (failed(SetMetadataProtoFromLaunchFuncOp(
-          launch_func, num_replicas, num_cores_per_replica, &metadata)))
+          launch_func, num_replicas, num_cores_per_replica,
+          std::move(xla_device_assignment), &metadata)))
    return nullptr;
  std::string txt_metadata;
@@ -416,7 +424,7 @@ void RemapOutputsOfParallelExecute(tf_device::LaunchFuncOp launch_func,
}
void AssignDevicesToReplicatedExecute(
-    const llvm::SmallVector<std::string, 8>& execution_devices,
+    llvm::ArrayRef<llvm::SmallVector<std::string, 8>> execution_devices,
    tf_device::ReplicateOp replicate, Operation* execute_op,
    OpBuilder* builder) {
  // If computation is replicated, execution devices are assigned to the
@@ -426,14 +434,18 @@ void AssignDevicesToReplicatedExecute(
  // Model parallelism is not supported for now. Therefore, assign all ops
  // in replicate op with virtual device alias specifying that ops will be
  // executed on the zeroth core.
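The hunk that follows keeps only each replica's zeroth-core device for the replicate op, while the non-replicated branch takes the first device of the first replica. A minimal standalone sketch of that selection, using plain standard-library types (the names and device strings here are illustrative, not the pass's API):

#include <iostream>
#include <string>
#include <vector>

int main() {
  // execution_devices[replica][logical_core], mirroring ExecutionDevices.
  std::vector<std::vector<std::string>> execution_devices = {
      {"/job:worker/replica:0/task:0/device:TPU:0"},
      {"/job:worker/replica:0/task:0/device:TPU:1"}};

  // Replicated case: keep core 0 of every replica.
  std::vector<std::string> replicate_execution_devices;
  replicate_execution_devices.reserve(execution_devices.size());
  for (const auto& replica_devices : execution_devices)
    replicate_execution_devices.push_back(replica_devices.front());

  for (const auto& d : replicate_execution_devices) std::cout << d << '\n';
  // The non-replicated case would use execution_devices.front().front().
}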
+ llvm::SmallVector replicate_execution_devices; + replicate_execution_devices.reserve(execution_devices.size()); + for (const auto& replica_execution_devices : execution_devices) + replicate_execution_devices.push_back(replica_execution_devices.front()); + auto device_attr = builder->getNamedAttr( tensorflow::GetDeviceAliasForLogicalCore(0), - builder->getStrArrayAttr(llvm::SmallVector{ - execution_devices.begin(), execution_devices.end()})); + builder->getStrArrayAttr(replicate_execution_devices)); replicate.setAttr(kDevicesAttr, builder->getDictionaryAttr(device_attr)); } else { - execute_op->setAttr(kDeviceAttr, - builder->getStringAttr(execution_devices.front())); + execute_op->setAttr( + kDeviceAttr, builder->getStringAttr(execution_devices.front().front())); } } @@ -526,21 +538,22 @@ LogicalResult Rewrite( int num_cores_per_replica = num_cores_per_replica_attr.getInt(); // Determine compilation and execution devices. - std::string compilation_device; - llvm::SmallVector execution_devices; - auto status = tensorflow::GetTPUCompilationAndExecutionDevices( - devices, num_replicas, num_cores_per_replica, &compilation_device, - &execution_devices); - if (!status.ok()) + auto status_or_tpu_device_assignment = + tensorflow::GetTPUCompilationAndExecutionDevices( + devices, num_replicas, num_cores_per_replica, /*topology_attr=*/"", + /*device_assignment_attr=*/{}); + if (!status_or_tpu_device_assignment.ok()) return launch_func.emitError() << "error in fetching TPU compilation/execution devices: " - << status.error_message(); + << status_or_tpu_device_assignment.status().error_message(); // Create compile op; + auto& tpu_device_assignment = status_or_tpu_device_assignment.ValueOrDie(); builder->setInsertionPoint(launch_func); - Operation* compile_op = - BuildCompileOp(launch_func, num_replicas, num_cores_per_replica, - compilation_device, builder); + Operation* compile_op = BuildCompileOp( + launch_func, num_replicas, num_cores_per_replica, + tpu_device_assignment.compilation_device, + std::move(tpu_device_assignment.xla_device_assignment), builder); if (!compile_op) return failure(); // After rewrite, find if there is a TPUCompilationResultOp in the block with @@ -566,8 +579,8 @@ LogicalResult Rewrite( // attributes to launch_op's within parallel_execute op. } else { execute_op = BuildExecuteOp(compile_op, launch_func, builder); - AssignDevicesToReplicatedExecute(execution_devices, replicate, execute_op, - builder); + AssignDevicesToReplicatedExecute(tpu_device_assignment.execution_devices, + replicate, execute_op, builder); launch_func.replaceAllUsesWith(execute_op); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc index e7bd44464d0..eb57e8ff742 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc @@ -26,7 +26,7 @@ limitations under the License. 
#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project #include "mlir/IR/Function.h" // TF:llvm-project @@ -119,8 +119,10 @@ struct TPUVariableRuntimeReformattingPass void runOnModule() override; }; -// Returns the earlier value of which `v` is an identity. -Value SkipIdentity(Value v, bool allow_other_use) { +// Returns the earlier value of which `v` is an identity. If `skipped` is +// provided, it will be used to store the identity nodes skipped. +Value SkipIdentity(Value v, bool allow_other_use, + llvm::SmallPtrSet* skipped = nullptr) { while (auto result = v.dyn_cast()) { if (!(allow_other_use || v.hasOneUse())) break; auto op = result.getDefiningOp(); @@ -128,6 +130,7 @@ Value SkipIdentity(Value v, bool allow_other_use) { break; } v = op->getOperand(result.getResultNumber()); + if (skipped) skipped->insert(op); } return v; } @@ -188,18 +191,14 @@ AnnotateCompileOpAndGetExecuteArgToWhileArgsMapping( if (data_type.getIntOrFloatBitWidth() == 64) continue; // We have found a mirrored variable which is an input to the replicated - // `execute`. Now set the enable_xla_sharding field in the metadata to - // inform the compile op. - auto metadata_arg = metadata.mutable_args(it->second); - metadata_arg->set_enable_xla_sharding( - ::tensorflow::tpu::TPUCompileMetadataProto_Arg::ALLOWED); - - // Now find if this mirrored variable is a pass-through of while arguments. + // `execute`. Now find if this mirrored variable is a pass-through of while + // arguments. llvm::SmallVector while_args; for (int64_t i = 0; i < num_replicas; ++i) { + llvm::SmallPtrSet skipped_identities; auto replicate_operand = SkipIdentity(replicate.getOperand(num_replicas * replicate_arg + i), - /*allow_other_use=*/false); + /*allow_other_use=*/false, &skipped_identities); auto block_arg = replicate_operand.dyn_cast(); // To qualify for a valid pass-through mirrored variable, it must satisfy // 1) it is the body's argument; @@ -210,7 +209,7 @@ AnnotateCompileOpAndGetExecuteArgToWhileArgsMapping( llvm::any_of(replicate_operand.getUsers(), [&](Operation* user) { return user != body.front().getTerminator() && - !llvm::isa(user) && + skipped_identities.count(user) == 0 && user != replicate; }) || !cond.getArgument(block_arg.getArgNumber()).use_empty()) { @@ -220,6 +219,11 @@ AnnotateCompileOpAndGetExecuteArgToWhileArgsMapping( while_args.push_back(while_op.getOperand(block_arg.getArgNumber())); } if (while_args.empty()) continue; + // Now set the enable_xla_sharding field in the metadata to inform the + // compile op. + auto metadata_arg = metadata.mutable_args(it->second); + metadata_arg->set_enable_xla_sharding( + ::tensorflow::tpu::TPUCompileMetadataProto_Arg::ALLOWED); mapping.emplace_back(it->second, std::move(while_args)); } // Sort the mapping according to execute operand order. diff --git a/tensorflow/compiler/mlir/tensorflow/translate/breakup-islands.cc b/tensorflow/compiler/mlir/tensorflow/translate/breakup-islands.cc index d40eec62cdc..32cb2e02930 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/breakup-islands.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/breakup-islands.cc @@ -19,7 +19,7 @@ limitations under the License. 
#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project #include "mlir/IR/Operation.h" // TF:llvm-project @@ -60,7 +60,7 @@ void BreakUpIslands::runOnFunction() { getOperation().getBody().front().front()); } if (!graph_op) { - getOperation().emitError("Expected function to contain only a graph_op"); + getOperation().emitError("expected function to contain only a graph_op"); signalPassFailure(); return; } @@ -239,7 +239,7 @@ void BreakUpIslands::BreakUpIsland( } else { // TODO(parkers): Any defining op that has a control output can be handled // just like an island. - fetch.getDefiningOp()->emitError("Fetching non-island as dependency."); + fetch.getDefiningOp()->emitError("fetching non-island as dependency"); return signalPassFailure(); } } @@ -298,18 +298,21 @@ void BreakUpIslands::BreakUpIsland( auto& sink_island_control = sink_island_controls[0]; island_op.control().replaceAllUsesWith(sink_island_control); // All existing outputs need to add sink_island_control as control input. + // GraphOp, YieldOp and NextIterationSourceOp don't have control inputs so + // exclude them below. for (Value out : island_op.outputs()) { for (auto& use : out.getUses()) { Operation* owner = use.getOwner(); if (auto other_island_op = llvm::dyn_cast(owner->getParentOp())) { (*new_control_inputs)[other_island_op].push_back(sink_island_control); - } else if (llvm::isa(owner) || - llvm::isa(owner) || - llvm::isa(owner)) { + } else if (owner->getDialect() == island_op.getDialect() && + !llvm::isa(owner) && + !llvm::isa(owner) && + !llvm::isa(owner)) { (*new_control_inputs)[owner].push_back(sink_island_control); } else { - use.getOwner()->emitError("Adding control dependency not supported"); + owner->emitOpError("adding control dependency not supported"); return signalPassFailure(); } } diff --git a/tensorflow/compiler/mlir/tensorflow/translate/control_to_executor_dialect.cc b/tensorflow/compiler/mlir/tensorflow/translate/control_to_executor_dialect.cc index 672ba418489..b89b3d8e6b2 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/control_to_executor_dialect.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/control_to_executor_dialect.cc @@ -22,7 +22,7 @@ limitations under the License. #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Sequence.h" #include "llvm/Support/Debug.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project #include "mlir/IR/Operation.h" // TF:llvm-project #include "mlir/IR/Value.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/translate/executor_to_control_dialect.cc b/tensorflow/compiler/mlir/tensorflow/translate/executor_to_control_dialect.cc index 96a7fcbb5ba..7755f5f2259 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/executor_to_control_dialect.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/executor_to_control_dialect.cc @@ -21,7 +21,7 @@ limitations under the License. 
#include "llvm/ADT/SmallString.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project #include "mlir/IR/Operation.h" // TF:llvm-project #include "mlir/IR/Value.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc index 529c2517508..0ae02ed63b6 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc @@ -28,7 +28,7 @@ limitations under the License. #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project #include "mlir/IR/Function.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index 39fe17800c9..1f0f8e2b9de 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -42,7 +42,7 @@ limitations under the License. #include "llvm/ADT/Twine.h" #include "llvm/Support/raw_ostream.h" #include "mlir/Analysis/Verifier.h" // TF:llvm-project -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project #include "mlir/IR/Function.h" // TF:llvm-project @@ -2516,13 +2516,14 @@ void StructuredValueLinearizer::RecursivelyFindLeaves( } } -// For exported functions with mutable bound inputs, rewrite the function -// signature to annotate resource subtypes on the types. +// For exported functions with bound inputs, rewrite the function +// signature to match the requirements of tf_saved_model bound input args. // // The raw imported functions have `tensor<*x!tf.resource>` as the type for -// mutable bound inputs. Here we turn that into +// mutable bound inputs and `tensor<...>` as the type for immutable +// bound inputs. Here we canonicalize both of them into // `tensor>>`. 
-void SetResourceSubtypes(mlir::ModuleOp module) { +void AdjustBoundInputArgTypes(mlir::ModuleOp module) { mlir::SymbolTable symbol_table(module); for (auto func : module.getOps()) { if (!mlir::tf_saved_model::IsExported(func)) continue; @@ -2532,19 +2533,26 @@ void SetResourceSubtypes(mlir::ModuleOp module) { auto arg = func.front().getArgument(i); auto global_tensor = mlir::tf_saved_model::LookupBoundInput(func, i, symbol_table); - if (global_tensor && global_tensor.is_mutable()) { + if (global_tensor) { auto old_type = arg.getType(); - auto new_type = mlir::RankedTensorType::get( - {}, mlir::TF::ResourceType::get( - {global_tensor.type().cast()}, - module.getContext())); + auto new_type = + mlir::tf_saved_model::GetBoundInputArgTypeFor(global_tensor); arg.setType(new_type); - auto arg_with_original_type = builder.create( - global_tensor.getLoc(), old_type, arg, - /*Truncate=*/builder.getBoolAttr(false)); - arg.replaceAllUsesWith(arg_with_original_type); - // The RAUW replaces the arg with itself, so we need to set it back. - arg_with_original_type.setOperand(arg); + if (global_tensor.is_mutable()) { + auto arg_with_original_type = builder.create( + global_tensor.getLoc(), old_type, arg, + /*Truncate=*/builder.getBoolAttr(false)); + arg.replaceAllUsesWith(arg_with_original_type); + // The RAUW replaces the arg with itself, so we need to set it back. + arg_with_original_type.setOperand(arg); + } else { + auto arg_with_original_type = + builder.create(global_tensor.getLoc(), + old_type, arg); + arg.replaceAllUsesWith(arg_with_original_type); + // The RAUW replaces the arg with itself, so we need to set it back. + arg_with_original_type.setOperand(arg); + } } new_input_types.push_back(arg.getType()); } @@ -2793,7 +2801,7 @@ Status CreateSavedModelIR( builder.getStrArrayAttr(object_names.GetExportedNames(node_id))); } } - SetResourceSubtypes(module); + AdjustBoundInputArgTypes(module); module.setAttr("tf_saved_model.semantics", builder.getUnitAttr()); SortSavedModelModule(module); return Status::OK(); diff --git a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.h b/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.h index 79a302b066b..4a67b7fae76 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.h @@ -16,7 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_MLIR_ROUNDTRIP_PASS_H_ #define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_MLIR_ROUNDTRIP_PASS_H_ -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "tensorflow/core/common_runtime/optimization_registry.h" #include "tensorflow/core/lib/core/status.h" diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_functional_to_executor.cc b/tensorflow/compiler/mlir/tensorflow/translate/tf_functional_to_executor.cc index a97bca9fc3d..2ee3893eac9 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_functional_to_executor.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_functional_to_executor.cc @@ -14,7 +14,7 @@ limitations under the License. 
==============================================================================*/ #include "llvm/Support/Debug.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project #include "mlir/IR/Operation.h" // TF:llvm-project #include "mlir/Pass/Pass.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc index cce5fde2883..ef1e52ee5c0 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc @@ -17,7 +17,7 @@ limitations under the License. #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Function.h" // TF:llvm-project #include "mlir/IR/MLIRContext.h" // TF:llvm-project #include "mlir/IR/OpDefinition.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc index a64b7ecfdb3..f8c118ac9d9 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc @@ -23,7 +23,7 @@ limitations under the License. #include "absl/strings/string_view.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Function.h" // TF:llvm-project #include "mlir/IR/Identifier.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc index 65fb8b86ea1..33a09a6ddfb 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc @@ -16,23 +16,38 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h" #include +#include #include #include #include +#include #include "absl/strings/string_view.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/FormatVariadic.h" +#include "tensorflow/compiler/xla/array3d.h" +#include "tensorflow/compiler/xla/service/computation_placer.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/protobuf/tpu/topology.pb.h" #include "tensorflow/core/util/device_name_utils.h" +#include "tensorflow/stream_executor/lib/statusor.h" namespace tensorflow { +// Device coordinates are defined as (x, y, core), thus resulting in a rank 3 +// topology. 
+constexpr int kTPUTopologyRank = 3;
constexpr char kDeviceTPUSystem[] = "TPU_SYSTEM";
constexpr char kDeviceTPU[] = "TPU";
constexpr char kTPUReplicatedCore[] = "TPU_REPLICATED_CORE";
+constexpr char kTopologyAttr[] = "topology";
+constexpr char kDeviceAssignmentAttr[] = "device_assignment";
using Device = DeviceNameUtils::ParsedName;
using Devices = llvm::ArrayRef<Device>;
@@ -149,12 +164,245 @@ std::string GetTPUCompilationDevice(Device system_device) {
  return DeviceNameUtils::ParsedNameToString(system_device);
}
+// Determines execution devices when topology and device assignment are not
+// defined. This is a special case where a single core computation is
+// replicated to every core in the mesh. TPU devices are simply added to
+// `execution_devices` of one replica. `num_replicas` must be 1 or the total
+// number of TPU devices available, and `num_cores_per_replica` must be 1.
+StatusOr<ExecutionDevices> GetFullMeshTPUExecutionDeviceAssignment(
+    int num_replicas, int num_cores_per_replica,
+    llvm::ArrayRef<llvm::SmallVector<Device, 8>> tpu_devices) {
+  const int num_tasks = tpu_devices.size();
+  const int num_tpus_per_task = tpu_devices[0].size();
+  const int num_tpu_devices = num_tasks * num_tpus_per_task;
+
+  if (num_replicas != 1 && num_replicas != num_tpu_devices)
+    return errors::InvalidArgument("'num_replicas' must be equal to 1 or ",
+                                   num_tpu_devices, ", got ", num_replicas);
+
+  if (num_cores_per_replica != 1)
+    return errors::InvalidArgument(
+        "'num_cores_per_replica' must be equal to 1, got ",
+        num_cores_per_replica);
+
+  ExecutionDevices execution_devices;
+  execution_devices.reserve(num_replicas);
+  for (int i = 0; i < num_replicas; ++i) {
+    const int task = i / num_tpus_per_task;
+    const int device = i % num_tpus_per_task;
+    execution_devices.push_back(
+        {tensorflow::DeviceNameUtils::ParsedNameToString(
+            tpu_devices[task][device])});
+  }
+
+  return execution_devices;
+}
+
+// Helper struct for keeping track of task and device for an associated TPU
+// device coordinate.
+struct TaskAndDevice {
+  TaskAndDevice() {}
+  TaskAndDevice(int task, int device) : task(task), device(device) {}
+
+  int task = -1;
+  int device = -1;
+};
+
+// Checks if device coordinate is outside of topology mesh shape bounds.
+bool DeviceCoordinateOutOfBound(int x, int y, int core, int bound_x,
+                                int bound_y, int bound_core) {
+  return x < 0 || x >= bound_x || y < 0 || y >= bound_y || core < 0 ||
+         core >= bound_core;
+}
+
+// Creates error message for an out of bound device coordinate.
+Status DeviceCoordinateErrorMsg(absl::string_view attribute, int x, int y,
+                                int core, int bound_x, int bound_y,
+                                int bound_core) {
+  return errors::InvalidArgument("device coordinate (", x, ", ", y, ", ", core,
+                                 ") in '", attribute,
+                                 "' is outside of mesh shape (", bound_x, ", ",
+                                 bound_y, ", ", bound_core, ")");
+}
+
+// Creates error message for a duplicate device coordinate.
+Status DuplicateCoordinateErrorMsg(absl::string_view attribute, int x, int y,
+                                   int core) {
+  return errors::InvalidArgument("'", attribute,
+                                 "' has duplicate device coordinate (", x, ", ",
+                                 y, ", ", core, ")");
+}
+
+// Parses and validates topology (serialized string of TopologyProto), and maps
+// device coordinate (x, y, core) to task and device (of available TPUs).
+// Topology attribute device coordinates are ordered by task then device (major
+// to minor).
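Before the topology parsing rules below, the full-mesh mapping above can be illustrated standalone: replica i lands on task i / num_tpus_per_task, device i % num_tpus_per_task. A minimal sketch with illustrative device names (not the utility's API):

#include <iostream>
#include <string>
#include <vector>

int main() {
  const int num_tasks = 2, num_tpus_per_task = 4;
  const int num_replicas = num_tasks * num_tpus_per_task;  // 1 core/replica.

  // Each replica is pinned to one physical TPU, walking devices within a
  // task before moving to the next task, as in the loop above.
  std::vector<std::vector<std::string>> execution_devices;
  for (int i = 0; i < num_replicas; ++i) {
    const int task = i / num_tpus_per_task;
    const int device = i % num_tpus_per_task;
    execution_devices.push_back({"/job:worker/replica:0/task:" +
                                 std::to_string(task) + "/device:TPU:" +
                                 std::to_string(device)});
  }
  for (const auto& replica : execution_devices)
    std::cout << replica.front() << '\n';
}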
+//
+// A valid TopologyProto must have:
+//  - a valid mesh shape (rank 3 with positive dimensions)
+//  - `num_tasks` and `num_tpu_devices_per_task` must match the number of
+//    available TPU hosts and devices per host
+//  - device coordinates within the mesh shape
+//  - no duplicate device coordinates
+//  - number of device coordinates (in tuple 3) matching the number of
+//    available TPUs
+StatusOr<xla::Array3D<TaskAndDevice>> ParseTopologyAttr(
+    llvm::StringRef topology_attr, int num_tasks, int num_tpus_per_task) {
+  tpu::TopologyProto topology_proto;
+  if (!topology_proto.ParseFromString(topology_attr.str()))
+    return errors::InvalidArgument("failed to parse '", kTopologyAttr,
+                                   "' attribute to TopologyProto");
+
+  if (topology_proto.mesh_shape_size() != kTPUTopologyRank)
+    return errors::InvalidArgument(
+        "'", kTopologyAttr, "' 'mesh_shape' must be rank ", kTPUTopologyRank,
+        ", got rank ", topology_proto.mesh_shape_size());
+
+  for (auto mesh_shape_dim : llvm::enumerate(topology_proto.mesh_shape()))
+    if (mesh_shape_dim.value() <= 0)
+      return errors::InvalidArgument(
+          "'", kTopologyAttr, "' 'mesh_shape' dimension ",
+          mesh_shape_dim.index(), " must be positive, got ",
+          mesh_shape_dim.value());
+
+  if (topology_proto.num_tasks() != num_tasks)
+    return errors::InvalidArgument(
+        "number of tasks from available TPU devices must be 'num_tasks' in '",
+        kTopologyAttr, "' (", topology_proto.num_tasks(), "), got ", num_tasks);
+
+  if (topology_proto.num_tpu_devices_per_task() != num_tpus_per_task)
+    return errors::InvalidArgument(
+        "number of TPU devices available per task must be "
+        "'num_tpu_devices_per_task' in '",
+        kTopologyAttr, "' (", topology_proto.num_tpu_devices_per_task(),
+        "), got ", num_tpus_per_task);
+
+  const int expected_device_coordinates_size =
+      num_tasks * num_tpus_per_task * kTPUTopologyRank;
+  if (topology_proto.device_coordinates_size() !=
+      expected_device_coordinates_size)
+    return errors::InvalidArgument(
+        "length of 'device_coordinates' in '", kTopologyAttr,
+        "' must be 'num_tasks' * 'num_tpus_per_task' * ", kTPUTopologyRank,
+        " (", num_tasks, " * ", num_tpus_per_task, " * ", kTPUTopologyRank,
+        "), got ", topology_proto.device_coordinates_size());
+
+  const int bound_x = topology_proto.mesh_shape(0);
+  const int bound_y = topology_proto.mesh_shape(1);
+  const int bound_core = topology_proto.mesh_shape(2);
+
+  xla::Array3D<TaskAndDevice> topology(bound_x, bound_y, bound_core, {});
+  int pos = 0;
+  for (int task = 0; task < num_tasks; ++task) {
+    for (int device = 0; device < num_tpus_per_task; ++device) {
+      int x = topology_proto.device_coordinates(pos++);
+      int y = topology_proto.device_coordinates(pos++);
+      int core = topology_proto.device_coordinates(pos++);
+      if (DeviceCoordinateOutOfBound(x, y, core, bound_x, bound_y, bound_core))
+        return DeviceCoordinateErrorMsg(kTopologyAttr, x, y, core, bound_x,
+                                        bound_y, bound_core);
+
+      auto& task_and_device = topology(x, y, core);
+      if (task_and_device.task != -1)
+        return DuplicateCoordinateErrorMsg(kTopologyAttr, x, y, core);
+
+      task_and_device = {task, device};
+    }
+  }
+
+  return topology;
+}
+
+// Determines execution devices when topology and device assignment are
+// defined. With a topology device coordinate to task and device mapping,
+// device assignment device coordinates can then be mapped to task and device
+// for TPU devices. The device assignment array is also validated.
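The coordinate bookkeeping in ParseTopologyAttr above, decoding task-major (x, y, core) triples into a coordinate grid with bounds and duplicate checks, can be sketched standalone with plain standard-library types (toy dimensions; illustrative only):

#include <iostream>
#include <utility>
#include <vector>

int main() {
  // Mesh of shape (x=2, y=1, core=2): 2 tasks with 2 devices each, with
  // coordinates listed task-major, 3 ints per device.
  const int bound_x = 2, bound_y = 1, bound_core = 2;
  const int num_tasks = 2, num_tpus_per_task = 2;
  const std::vector<int> device_coordinates = {0, 0, 0, 0, 0, 1,
                                               1, 0, 0, 1, 0, 1};

  // topology[x][y][core] -> (task, device); -1 marks "no TPU seen yet".
  std::vector<std::vector<std::vector<std::pair<int, int>>>> topology(
      bound_x, std::vector<std::vector<std::pair<int, int>>>(
                   bound_y, std::vector<std::pair<int, int>>(bound_core,
                                                             {-1, -1})));
  int pos = 0;
  for (int task = 0; task < num_tasks; ++task) {
    for (int device = 0; device < num_tpus_per_task; ++device) {
      int x = device_coordinates[pos++];
      int y = device_coordinates[pos++];
      int core = device_coordinates[pos++];
      if (x < 0 || x >= bound_x || y < 0 || y >= bound_y || core < 0 ||
          core >= bound_core) { std::cerr << "out of bounds\n"; return 1; }
      if (topology[x][y][core].first != -1) { std::cerr << "dup\n"; return 1; }
      topology[x][y][core] = {task, device};
    }
  }
  std::cout << "coordinate (1, 0, 1) -> task " << topology[1][0][1].first
            << ", device " << topology[1][0][1].second << '\n';  // task 1, 1
}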
+//
+// A valid device assignment array must have:
+//  - device coordinates within the topology mesh shape
+//  - no duplicate device coordinates
+//  - number of device coordinates (in tuple 3) matching 'num_replicas' *
+//    'num_cores_per_replica'
+//  - a TPU device associated with each device coordinate
+StatusOr<std::pair<ExecutionDevices, xla::DeviceAssignmentProto>>
+GetGeneralTPUExecutionDeviceAssignment(
+    int num_replicas, int num_cores_per_replica,
+    llvm::ArrayRef<llvm::SmallVector<Device, 8>> tpu_devices,
+    llvm::StringRef topology_attr,
+    llvm::ArrayRef<int64_t> device_assignment_attr) {
+  const int num_tasks = tpu_devices.size();
+  const int num_tpus_per_task = tpu_devices[0].size();
+
+  TF_ASSIGN_OR_RETURN(auto topology, ParseTopologyAttr(topology_attr, num_tasks,
+                                                       num_tpus_per_task));
+
+  const int expected_device_assignment_size =
+      num_replicas * num_cores_per_replica * kTPUTopologyRank;
+  if (device_assignment_attr.size() != expected_device_assignment_size)
+    return errors::InvalidArgument(
+        "length of '", kDeviceAssignmentAttr,
+        "' must be 'num_replicas' * 'num_cores_per_replica' * ",
+        kTPUTopologyRank, " (", num_replicas, " * ", num_cores_per_replica,
+        " * ", kTPUTopologyRank, "), got ", device_assignment_attr.size());
+
+  const int bound_x = topology.n1();
+  const int bound_y = topology.n2();
+  const int bound_core = topology.n3();
+
+  // TPU XLA device ID is determined by its device coordinate, from major to
+  // minor coordinates (y, x, core).
+  auto location_to_id = [&](int x, int y, int core) {
+    return x * bound_core + y * bound_x * bound_core + core;
+  };
+
+  std::vector<bool> used_device_ids(
+      location_to_id(bound_x - 1, bound_y - 1, bound_core - 1), false);
+  ExecutionDevices execution_devices(
+      num_replicas,
+      llvm::SmallVector<std::string, 8>(num_cores_per_replica, ""));
+  xla::DeviceAssignment device_assignment(num_replicas, num_cores_per_replica);
+  int pos = 0;
+  for (int replica = 0; replica < num_replicas; ++replica) {
+    for (int logical_core = 0; logical_core < num_cores_per_replica;
+         ++logical_core) {
+      int x = device_assignment_attr[pos++];
+      int y = device_assignment_attr[pos++];
+      int core = device_assignment_attr[pos++];
+      if (DeviceCoordinateOutOfBound(x, y, core, bound_x, bound_y, bound_core))
+        return DeviceCoordinateErrorMsg(kDeviceAssignmentAttr, x, y, core,
+                                        bound_x, bound_y, bound_core);
+
+      TaskAndDevice task_and_device = topology(x, y, core);
+      const int task = task_and_device.task;
+      const int device = task_and_device.device;
+      if (task == -1 || device == -1)
+        return errors::InvalidArgument(
+            "no TPU device found for '", kDeviceAssignmentAttr,
+            "' device coordinate (", x, ", ", y, ", ", core, ")");
+
+      const int device_id = location_to_id(x, y, core);
+      if (used_device_ids[device_id])
+        return DuplicateCoordinateErrorMsg(kDeviceAssignmentAttr, x, y, core);
+
+      used_device_ids[device_id] = true;
+      device_assignment(replica, logical_core) = device_id;
+      execution_devices[replica][logical_core] =
+          DeviceNameUtils::ParsedNameToString(tpu_devices[task][device]);
+    }
+  }
+
+  xla::DeviceAssignmentProto device_assignment_proto;
+  TF_RETURN_IF_ERROR(device_assignment.Serialize(&device_assignment_proto));
+
+  return std::pair<ExecutionDevices, xla::DeviceAssignmentProto>(
+      std::move(execution_devices), std::move(device_assignment_proto));
+}
+
} // anonymous namespace
-Status GetTPUCompilationAndExecutionDevices(
    Devices devices, int num_replicas, int num_cores_per_replica,
-    std::string* compilation_device,
-    llvm::SmallVectorImpl<std::string>* execution_devices) {
+StatusOr<TPUDeviceAssignment> GetTPUCompilationAndExecutionDevices(
+    Devices devices, int num_replicas, int num_cores_per_replica,
+    llvm::StringRef topology_attr,
+    llvm::ArrayRef<int64_t> device_assignment_attr) {
  // Collect
TPU_SYSTEM devices. llvm::SmallVector system_devices; TF_RETURN_IF_ERROR(GetTPUSystemDevices(devices, &system_devices)); @@ -163,32 +411,32 @@ Status GetTPUCompilationAndExecutionDevices( llvm::SmallVector, 8> tpu_devices; TF_RETURN_IF_ERROR(GetTPUDevices(devices, system_devices, &tpu_devices)); - const int num_tasks = tpu_devices.size(); - const int num_tpus_per_task = tpu_devices[0].size(); - const int num_tpu_devices = num_tasks * num_tpus_per_task; + std::string compilation_device = GetTPUCompilationDevice(system_devices[0]); - if (num_replicas != 1 && num_replicas != num_tpu_devices) - return errors::Unimplemented("num_replicas must be equal to 1 or ", - num_tpu_devices, ", got ", num_replicas); + if (topology_attr.empty()) { + if (!device_assignment_attr.empty()) + return errors::InvalidArgument("'", kDeviceAssignmentAttr, + "' must not be set when '", kTopologyAttr, + "' is not set"); - *compilation_device = GetTPUCompilationDevice(system_devices[0]); - - // TODO(lyandy): Update `execution_devices` to be 2D when support for explicit - // topologies is added. - execution_devices->reserve(num_replicas); - for (int i = 0; i < num_replicas; ++i) { - const int task = i / num_tpus_per_task; - const int device = i % num_tpus_per_task; - execution_devices->push_back( - tensorflow::DeviceNameUtils::ParsedNameToString( - tpu_devices[task][device])); + TF_ASSIGN_OR_RETURN(auto execution_devices, + GetFullMeshTPUExecutionDeviceAssignment( + num_replicas, num_cores_per_replica, tpu_devices)); + return TPUDeviceAssignment(compilation_device, + std::move(execution_devices)); } - return Status::OK(); + TF_ASSIGN_OR_RETURN(auto devices_and_ids, + GetGeneralTPUExecutionDeviceAssignment( + num_replicas, num_cores_per_replica, tpu_devices, + topology_attr, device_assignment_attr)); + return TPUDeviceAssignment(compilation_device, + std::move(devices_and_ids.first), + std::move(devices_and_ids.second)); } std::string GetDeviceAliasForLogicalCore(int core_index) { - return llvm::formatv("{0}_{1}", kTPUReplicatedCore, core_index); + return llvm::formatv("{0}_{1}", kTPUReplicatedCore, core_index).str(); } } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h index eb8386c0f64..dd296a13f4b 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h @@ -19,45 +19,201 @@ limitations under the License. #include #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/util/device_name_utils.h" +#include "tensorflow/stream_executor/lib/statusor.h" namespace tensorflow { -// Finds the TPU compilation device and execution devices from `devices` for a -// replicated TPU computation subgraph. Compilation device is determined from -// looking up all TPU_SYSTEM:0 devices and choosing the CPU device associated -// to the first TPU_SYSTEM device sorted lexicographically by replica and task. -// Execution devices are determined by looking up all TPU devices associated -// with each TPU_SYSTEM:0 device found. A failure will be returned if it is not -// possible (e.g. invalid devices). 
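The XLA device-id scheme used by location_to_id earlier in this hunk can be checked standalone on a 2x2x2 mesh; a minimal sketch (illustrative only; note the flag vector here is sized to the largest id plus one so the highest coordinate stays in range):

#include <iostream>
#include <vector>

int main() {
  const int bound_x = 2, bound_y = 2, bound_core = 2;

  // Same id scheme as location_to_id: y is most significant, then x, then
  // core.
  auto location_to_id = [&](int x, int y, int core) {
    return x * bound_core + y * bound_x * bound_core + core;
  };

  // One flag per id; the largest id is location_to_id(bx-1, by-1, bc-1).
  std::vector<bool> used(
      location_to_id(bound_x - 1, bound_y - 1, bound_core - 1) + 1, false);

  for (int y = 0; y < bound_y; ++y)
    for (int x = 0; x < bound_x; ++x)
      for (int core = 0; core < bound_core; ++core) {
        int id = location_to_id(x, y, core);
        used[id] = true;
        std::cout << "(" << x << ", " << y << ", " << core << ") -> " << id
                  << '\n';
      }
  // Ids run 0..7 with (0,0,0)->0, (1,0,0)->2, (0,1,0)->4, (1,1,1)->7.
}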
diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h
index eb8386c0f64..dd296a13f4b 100644
--- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h
+++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h
@@ -19,45 +19,201 @@ limitations under the License.
 
 #include <string>
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/util/device_name_utils.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
 
 namespace tensorflow {
 
-// Finds the TPU compilation device and execution devices from `devices` for a
-// replicated TPU computation subgraph. Compilation device is determined from
-// looking up all TPU_SYSTEM:0 devices and choosing the CPU device associated
-// to the first TPU_SYSTEM device sorted lexicographically by replica and task.
-// Execution devices are determined by looking up all TPU devices associated
-// with each TPU_SYSTEM:0 device found. A failure will be returned if it is not
-// possible (e.g. invalid devices).
-//
-// For example, with `num_replicas` = 4 and `devices`:
-//   /job:localhost/replica:0/task:0/device:CPU:0
-//   /job:worker/replica:0/task:0/device:CPU:0
-//   /job:worker/replica:0/task:0/device:TPU_SYSTEM:0
-//   /job:worker/replica:0/task:0/device:TPU:0
-//   /job:worker/replica:0/task:0/device:TPU:1
-//   /job:worker/replica:0/task:1/device:CPU:0
-//   /job:worker/replica:0/task:1/device:TPU_SYSTEM:0
-//   /job:worker/replica:0/task:1/device:TPU:0
-//   /job:worker/replica:0/task:1/device:TPU:1
-//
-// The compilation device will be:
-//   /job:worker/replica:0/task:0/device:CPU:0
-//
-// and the execution devices (sorted) will be:
-//   /job:worker/replica:0/task:0/device:TPU:0
-//   /job:worker/replica:0/task:0/device:TPU:1
-//   /job:worker/replica:0/task:1/device:TPU:0
-//   /job:worker/replica:0/task:1/device:TPU:1
-Status GetTPUCompilationAndExecutionDevices(
-    llvm::ArrayRef<DeviceNameUtils::ParsedName> devices, int num_replicas,
-    int num_cores_per_replica, std::string* compilation_device,
-    llvm::SmallVectorImpl<std::string>* execution_devices);
+using stream_executor::port::StatusOr;
 
-// Virutal device is used for evice assignment for executing ops
-// on a specified logical core.
+// TPU devices to be used for execution (e.g. devices for TPUExecute ops). They
+// are ordered by `num_replicas` followed by `num_cores_per_replica`.
+using ExecutionDevices =
+    llvm::SmallVector<llvm::SmallVector<std::string, 8>, 8>;
+
+// TPU compilation device, execution devices, and optionally execution device
+// IDs. Execution device IDs are populated only if `topology` and
+// `device_assignment` are provided.
+struct TPUDeviceAssignment {
+  TPUDeviceAssignment(llvm::StringRef compilation_device,
+                      ExecutionDevices&& execution_devices)
+      : compilation_device(compilation_device),
+        execution_devices(std::move(execution_devices)) {}
+
+  TPUDeviceAssignment(llvm::StringRef compilation_device,
+                      ExecutionDevices&& execution_devices,
+                      xla::DeviceAssignmentProto&& xla_device_assignment)
+      : compilation_device(compilation_device),
+        execution_devices(std::move(execution_devices)),
+        xla_device_assignment(std::move(xla_device_assignment)) {}
+
+  std::string compilation_device;
+  ExecutionDevices execution_devices;
+  llvm::Optional<xla::DeviceAssignmentProto> xla_device_assignment;
+};
+
+// Finds the TPU compilation device and execution devices from `devices` for a
+// TPU computation subgraph. The compilation device is determined by looking up
+// all TPU_SYSTEM:0 devices and choosing the CPU device associated with the
+// first TPU_SYSTEM device, sorted lexicographically by replica and task.
+// Execution devices are determined by looking up all TPU devices associated
+// with each TPU_SYSTEM:0 device found, alongside the associated
+// `topology_attr` and `device_assignment_attr`. If `topology_attr` is not an
+// empty string (and is parsable as a TopologyProto), `device_assignment_attr`
+// must not be empty either. When both `topology_attr` and
+// `device_assignment_attr` are non-empty, a general device assignment based on
+// those two attributes is used. Otherwise, when both are empty, a full mesh
+// device assignment is used instead. A failure will be returned if this is not
+// possible (e.g. invalid devices or invalid parameters).
+//
+// For example, for `devices`:
+//   {
+//     /job:localhost/replica:0/task:0/device:CPU:0,
+//     /job:worker/replica:0/task:0/device:CPU:0,
+//     /job:worker/replica:0/task:0/device:TPU_SYSTEM:0,
+//     /job:worker/replica:0/task:0/device:TPU:0,
+//     /job:worker/replica:0/task:0/device:TPU:1,
+//     /job:worker/replica:0/task:0/device:TPU:2,
+//     /job:worker/replica:0/task:0/device:TPU:3,
+//     /job:worker/replica:0/task:1/device:CPU:0,
+//     /job:worker/replica:0/task:1/device:TPU_SYSTEM:0,
+//     /job:worker/replica:0/task:1/device:TPU:0,
+//     /job:worker/replica:0/task:1/device:TPU:1,
+//     /job:worker/replica:0/task:1/device:TPU:2,
+//     /job:worker/replica:0/task:1/device:TPU:3
+//   }
+//
+//
+// With the following parameters (full mesh device assignment):
+//   `num_replicas` = 8
+//   `num_cores_per_replica` = 1
+//   `topology_attr` = ""
+//   `device_assignment_attr` = {}
+//
+// The `compilation_device` will be:
+//   /job:worker/replica:0/task:0/device:CPU:0
+//
+// `execution_devices` will be:
+//   {
+//     {
+//       /job:worker/replica:0/task:0/device:TPU:0
+//     },
+//     {
+//       /job:worker/replica:0/task:0/device:TPU:1
+//     },
+//     {
+//       /job:worker/replica:0/task:0/device:TPU:2
+//     },
+//     {
+//       /job:worker/replica:0/task:0/device:TPU:3
+//     },
+//     {
+//       /job:worker/replica:0/task:1/device:TPU:0
+//     },
+//     {
+//       /job:worker/replica:0/task:1/device:TPU:1
+//     },
+//     {
+//       /job:worker/replica:0/task:1/device:TPU:2
+//     },
+//     {
+//       /job:worker/replica:0/task:1/device:TPU:3
+//     }
+//   }
+//
+// and `xla_device_assignment` will not be set.
+//
+//
+// With the following parameters (general device assignment):
+//   `num_replicas` = 4
+//   `num_cores_per_replica` = 2
+//   `topology_attr` (in proto debug string format) =
+//   {
+//     mesh_shape: 2
+//     mesh_shape: 2
+//     mesh_shape: 2
+//     num_tasks: 2
+//     num_tpu_devices_per_task: 4
+//     device_coordinates: 0
+//     device_coordinates: 0
+//     device_coordinates: 0
+//     device_coordinates: 0
+//     device_coordinates: 1
+//     device_coordinates: 0
+//     device_coordinates: 1
+//     device_coordinates: 1
+//     device_coordinates: 0
+//     device_coordinates: 1
+//     device_coordinates: 0
+//     device_coordinates: 0
+//     device_coordinates: 1
+//     device_coordinates: 0
+//     device_coordinates: 1
+//     device_coordinates: 1
+//     device_coordinates: 1
+//     device_coordinates: 1
+//     device_coordinates: 0
+//     device_coordinates: 1
+//     device_coordinates: 1
+//     device_coordinates: 0
+//     device_coordinates: 0
+//     device_coordinates: 1
+//   }
+//   `device_assignment_attr` =
+//   {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1}
+//
+// The `compilation_device` will be:
+//   /job:worker/replica:0/task:0/device:CPU:0
+//
+// `execution_devices` will be:
+//   {
+//     {
+//       "/job:worker/replica:0/task:0/device:TPU:0",
+//       "/job:worker/replica:0/task:1/device:TPU:3"
+//     },
+//     {
+//       "/job:worker/replica:0/task:0/device:TPU:1",
+//       "/job:worker/replica:0/task:1/device:TPU:2"
+//     },
+//     {
+//       "/job:worker/replica:0/task:0/device:TPU:3",
+//       "/job:worker/replica:0/task:1/device:TPU:0"
+//     },
+//     {
+//       "/job:worker/replica:0/task:0/device:TPU:2",
+//       "/job:worker/replica:0/task:1/device:TPU:1"
+//     }
+//   }
+//
+// and `xla_device_assignment` will be:
+//   {
+//     replica_count: 4
+//     computation_count: 2
+//     computation_devices {
+//       replica_device_ids: 0
+//       replica_device_ids: 4
+//       replica_device_ids: 2
+//       replica_device_ids: 6
+//     }
+//     computation_devices {
+//       replica_device_ids: 1
+//       replica_device_ids: 5
+//       replica_device_ids: 3
+//       replica_device_ids: 7
+//     }
+//   }
+StatusOr<TPUDeviceAssignment> GetTPUCompilationAndExecutionDevices(
+    llvm::ArrayRef<DeviceNameUtils::ParsedName> devices, int num_replicas,
+    int num_cores_per_replica, llvm::StringRef topology_attr,
+    llvm::ArrayRef<int64_t> device_assignment_attr);
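The `xla_device_assignment` in the example above follows mechanically from the same (x, y, core) -> ID rule as the implementation. A standalone sketch (illustrative only, using the example's 2x2x2 mesh) that recomputes the expected `replica_device_ids` per logical core:

#include <cstdio>
#include <vector>

int main() {
  const int num_replicas = 4, num_cores_per_replica = 2;
  const int bound_x = 2, bound_core = 2;
  // `device_assignment_attr` from the general assignment example above.
  const std::vector<int> attr = {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
                                 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1};
  for (int core = 0; core < num_cores_per_replica; ++core) {
    std::printf("computation_devices {");
    for (int replica = 0; replica < num_replicas; ++replica) {
      const int pos = (replica * num_cores_per_replica + core) * 3;
      // Same ID rule: x * bound_core + y * bound_x * bound_core + core.
      const int id = attr[pos] * bound_core +
                     attr[pos + 1] * bound_x * bound_core + attr[pos + 2];
      std::printf(" replica_device_ids: %d", id);  // prints 0 4 2 6, then 1 5 3 7
    }
    std::printf(" }\n");
  }
  return 0;
}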
+// Virtual device is used for device assignment for executing ops on a
+// specified logical core.
 std::string GetDeviceAliasForLogicalCore(int core_index);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc
index 8abc3e7bcdf..de7009b495f 100644
--- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc
+++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc
@@ -15,14 +15,13 @@ limitations under the License.
 
 #include "tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h"
 
-#include 
+#include 
 #include 
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/FormatVariadic.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/tpu/topology.pb.h"
 #include "tensorflow/core/util/device_name_utils.h"
 
 namespace tensorflow {
@@ -51,14 +50,14 @@ struct ParameterizedDeviceSetTest
 TEST_P(ParameterizedDeviceSetTest, BadDeviceSet) {
   llvm::SmallVector<DeviceNameUtils::ParsedName, 8> devices;
   ASSERT_TRUE(DeviceNamesToParsedNames(std::get<0>(GetParam()), &devices));
-  std::string compilation_device;
-  llvm::SmallVector<std::string, 8> execution_devices;
+  std::string topology_attr;
+  std::vector<int64_t> device_assignment_attr;
 
-  Status s = GetTPUCompilationAndExecutionDevices(
-      devices, /*num_replicas=*/1, /*num_cores_per_replica=*/1,
-      &compilation_device, &execution_devices);
-  ASSERT_FALSE(s.ok());
-  EXPECT_EQ(s.error_message(), std::get<1>(GetParam()));
+  auto status_or = GetTPUCompilationAndExecutionDevices(
+      devices, /*num_replicas=*/1, /*num_cores_per_replica=*/1, topology_attr,
+      device_assignment_attr);
+  ASSERT_FALSE(status_or.ok());
+  EXPECT_EQ(status_or.status().error_message(), std::get<1>(GetParam()));
 }
 
 INSTANTIATE_TEST_SUITE_P(
@@ -85,59 +84,441 @@ INSTANTIATE_TEST_SUITE_P(
         "expected the number of TPU devices per host to be 2, got 1")));
 
 struct ParameterizedMetadataTest
-    : ::testing::TestWithParam<std::tuple<int, int, std::string>> {};
+    : ::testing::TestWithParam<
+          std::tuple<int, int, std::string, std::vector<int64_t>, std::string>> {
+};
 
 TEST_P(ParameterizedMetadataTest, BadMetadata) {
   llvm::SmallVector<DeviceNameUtils::ParsedName, 8> devices;
   ASSERT_TRUE(DeviceNamesToParsedNames(
      {"/job:worker/replica:0/task:0/device:TPU_SYSTEM:0",
       "/job:worker/replica:0/task:0/device:TPU:0",
-      "/job:worker/replica:0/task:0/device:TPU:1",
       "/job:worker/replica:0/task:1/device:TPU_SYSTEM:0",
-      "/job:worker/replica:0/task:1/device:TPU:0",
-      "/job:worker/replica:0/task:1/device:TPU:1"},
+      "/job:worker/replica:0/task:1/device:TPU:0"},
      &devices));
   std::string compilation_device;
-  llvm::SmallVector<std::string, 8> execution_devices;
+  llvm::SmallVector<llvm::SmallVector<std::string, 8>, 8> execution_devices;
+  llvm::Optional<xla::DeviceAssignmentProto> xla_device_assignment;
 
-  Status s = GetTPUCompilationAndExecutionDevices(
+  auto status_or = GetTPUCompilationAndExecutionDevices(
       devices, std::get<0>(GetParam()), std::get<1>(GetParam()),
-      &compilation_device, &execution_devices);
-  ASSERT_FALSE(s.ok());
-  EXPECT_EQ(s.error_message(), std::get<2>(GetParam()));
+      std::get<2>(GetParam()), std::get<3>(GetParam()));
+  ASSERT_FALSE(status_or.ok());
+  EXPECT_EQ(status_or.status().error_message(), std::get<4>(GetParam()));
+}
+
+std::string TopologyWithMeshShape(llvm::ArrayRef<int> mesh_shape) {
+  tpu::TopologyProto topology_proto;
+  for (int mesh_dim : mesh_shape) topology_proto.add_mesh_shape(mesh_dim);
+
return topology_proto.SerializeAsString(); +} + +std::string TopologyWithMeshShapeAndTasks(llvm::ArrayRef mesh_shape, + int num_tasks, + int num_tpu_devices_per_task) { + tpu::TopologyProto topology_proto; + for (int mesh_dim : mesh_shape) topology_proto.add_mesh_shape(mesh_dim); + topology_proto.set_num_tasks(num_tasks); + topology_proto.set_num_tpu_devices_per_task(num_tpu_devices_per_task); + return topology_proto.SerializeAsString(); +} + +std::string TopologyWithDeviceCoordinates( + llvm::ArrayRef device_coordinates) { + tpu::TopologyProto topology_proto; + topology_proto.add_mesh_shape(2); + topology_proto.add_mesh_shape(1); + topology_proto.add_mesh_shape(1); + topology_proto.set_num_tasks(2); + topology_proto.set_num_tpu_devices_per_task(1); + for (int device_coordinate : device_coordinates) + topology_proto.add_device_coordinates(device_coordinate); + return topology_proto.SerializeAsString(); } INSTANTIATE_TEST_SUITE_P( - BadMetadata, ParameterizedMetadataTest, + BadFullMeshMetadata, ParameterizedMetadataTest, ::testing::Values( - std::make_tuple(8, 1, "num_replicas must be equal to 1 or 4, got 8"))); + std::make_tuple( + 2, 1, "", std::vector{0}, + "'device_assignment' must not be set when 'topology' is not set"), + std::make_tuple(8, 1, "", std::vector(), + "'num_replicas' must be equal to 1 or 2, got 8"), + std::make_tuple(2, 2, "", std::vector(), + "'num_cores_per_replica' must be equal to 1, got 2"))); + +INSTANTIATE_TEST_SUITE_P( + BadGeneralTopologyMetadata, ParameterizedMetadataTest, + ::testing::Values( + std::make_tuple( + 2, 1, "BAD_TOPOLOGY", std::vector(), + "failed to parse 'topology' attribute to TopologyProto"), + std::make_tuple(4, 2, TopologyWithMeshShape({0}), + std::vector(), + "'topology' 'mesh_shape' must be rank 3, got rank 1"), + std::make_tuple( + 2, 1, TopologyWithMeshShape({2, 0, 2}), std::vector(), + "'topology' 'mesh_shape' dimension 1 must be positive, got 0"), + std::make_tuple(2, 1, TopologyWithMeshShapeAndTasks({1, 1, 1}, 1, 1), + std::vector(), + "number of tasks from available TPU devices must be " + "'num_tasks' in 'topology' (1), got 2"), + std::make_tuple(2, 1, TopologyWithMeshShapeAndTasks({1, 1, 1}, 2, 2), + std::vector(), + "number of TPU devices available per task must be " + "'num_tpu_devices_per_task' in 'topology' (2), got 1"), + std::make_tuple( + 2, 1, TopologyWithDeviceCoordinates({}), std::vector(), + "length of 'device_coordinates' in 'topology' must be 'num_tasks' " + "* 'num_tpus_per_task' * 3 (2 * 1 * 3), got 0"), + std::make_tuple(2, 1, + TopologyWithDeviceCoordinates({-1, 0, 0, 1, 0, 0}), + std::vector(), + "device coordinate (-1, 0, 0) in 'topology' is outside " + "of mesh shape (2, 1, 1)"), + std::make_tuple(2, 1, TopologyWithDeviceCoordinates({2, 0, 0, 1, 0, 0}), + std::vector(), + "device coordinate (2, 0, 0) in 'topology' is outside " + "of mesh shape (2, 1, 1)"), + std::make_tuple(2, 1, + TopologyWithDeviceCoordinates({0, -1, 0, 1, 0, 0}), + std::vector(), + "device coordinate (0, -1, 0) in 'topology' is outside " + "of mesh shape (2, 1, 1)"), + std::make_tuple(2, 1, TopologyWithDeviceCoordinates({0, 1, 0, 1, 0, 0}), + std::vector(), + "device coordinate (0, 1, 0) in 'topology' is outside " + "of mesh shape (2, 1, 1)"), + std::make_tuple(2, 1, + TopologyWithDeviceCoordinates({0, 0, -1, 1, 0, 0}), + std::vector(), + "device coordinate (0, 0, -1) in 'topology' is outside " + "of mesh shape (2, 1, 1)"), + std::make_tuple(2, 1, TopologyWithDeviceCoordinates({0, 0, 1, 1, 0, 0}), + std::vector(), + "device coordinate (0, 0, 
1) in 'topology' is outside " + "of mesh shape (2, 1, 1)"), + std::make_tuple( + 2, 1, TopologyWithDeviceCoordinates({0, 0, 0, 0, 0, 0}), + std::vector(), + "'topology' has duplicate device coordinate (0, 0, 0)"))); + +INSTANTIATE_TEST_SUITE_P( + BadGeneralDeviceAssignmentMetadata, ParameterizedMetadataTest, + ::testing::Values( + std::make_tuple(2, 1, TopologyWithDeviceCoordinates({0, 0, 0, 1, 0, 0}), + std::vector(), + "length of 'device_assignment' must be 'num_replicas' " + "* 'num_cores_per_replica' * 3 (2 * 1 * 3), got 0"), + std::make_tuple(2, 1, TopologyWithDeviceCoordinates({0, 0, 0, 1, 0, 0}), + std::vector{-1, 0, 0, 0, 0, 0}, + "device coordinate (-1, 0, 0) in 'device_assignment' " + "is outside of mesh shape (2, 1, 1)"), + std::make_tuple(2, 1, TopologyWithDeviceCoordinates({0, 0, 0, 1, 0, 0}), + std::vector{2, 0, 0, 0, 0, 0}, + "device coordinate (2, 0, 0) in 'device_assignment' is " + "outside of mesh shape (2, 1, 1)"), + std::make_tuple(2, 1, TopologyWithDeviceCoordinates({0, 0, 0, 1, 0, 0}), + std::vector{0, -1, 0, 0, 0, 0}, + "device coordinate (0, -1, 0) in 'device_assignment' " + "is outside of mesh shape (2, 1, 1)"), + std::make_tuple(2, 1, TopologyWithDeviceCoordinates({0, 0, 0, 1, 0, 0}), + std::vector{0, 1, 0, 0, 0, 0}, + "device coordinate (0, 1, 0) in 'device_assignment' is " + "outside of mesh shape (2, 1, 1)"), + std::make_tuple(2, 1, TopologyWithDeviceCoordinates({0, 0, 0, 1, 0, 0}), + std::vector{0, 0, -1, 0, 0, 0}, + "device coordinate (0, 0, -1) in 'device_assignment' " + "is outside of mesh shape (2, 1, 1)"), + std::make_tuple(2, 1, TopologyWithDeviceCoordinates({0, 0, 0, 1, 0, 0}), + std::vector{0, 0, 1, 0, 0, 0}, + "device coordinate (0, 0, 1) in 'device_assignment' is " + "outside of mesh shape (2, 1, 1)"), + std::make_tuple( + 2, 1, TopologyWithDeviceCoordinates({0, 0, 0, 1, 0, 0}), + std::vector{0, 0, 0, 0, 0, 0}, + "'device_assignment' has duplicate device coordinate (0, 0, 0)"))); + +std::vector MakeDeviceSet(int num_tasks, + int num_devices_per_task) { + std::vector devices{ + "/job:localhost/replica:0/task:0/device:CPU:0"}; + devices.reserve(num_tasks * num_devices_per_task + num_tasks + 1); + + for (int task = 0; task < num_tasks; ++task) { + devices.push_back( + llvm::formatv("/job:worker/replica:0/task:{0}/device:CPU:0", task) + .str()); + devices.push_back( + llvm::formatv("/job:worker/replica:0/task:{0}/device:TPU_SYSTEM:0", + task) + .str()); + for (int device = 0; device < num_devices_per_task; ++device) + devices.push_back( + llvm::formatv("/job:worker/replica:0/task:{0}/device:TPU:{1}", task, + device) + .str()); + } + + return devices; +} + +TEST(TPURewriteDeviceUtilTest, + BadGeneralDeviceAssignmentMetadataMissingDevice) { + tpu::TopologyProto topology_proto; + { + topology_proto.add_mesh_shape(2); + topology_proto.add_mesh_shape(1); + topology_proto.add_mesh_shape(1); + topology_proto.set_num_tasks(1); + topology_proto.set_num_tpu_devices_per_task(1); + topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(0); + } + + std::string topology_attr = topology_proto.SerializeAsString(); + std::vector device_assignment_attr{1, 0, 0}; -TEST(TPURewriteDeviceUtilTest, NumReplicasNumTPUs) { llvm::SmallVector devices; - ASSERT_TRUE(DeviceNamesToParsedNames( - {"/job:localhost/replica:0/task:0/device:CPU:0", - "/job:worker/replica:0/task:1/device:TPU_SYSTEM:0", - "/job:worker/replica:0/task:1/device:TPU:1", - "/job:worker/replica:0/task:0/device:TPU:0", - 
"/job:worker/replica:0/task:1/device:CPU:0", - "/job:worker/replica:0/task:0/device:TPU:1", - "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", - "/job:worker/replica:0/task:1/device:TPU:0", - "/job:worker/replica:0/task:0/device:CPU:0"}, - &devices)); - std::string compilation_device; - llvm::SmallVector execution_devices; + std::vector device_names = + MakeDeviceSet(/*num_tasks=*/1, /*num_devices_per_task=*/1); + ASSERT_TRUE(DeviceNamesToParsedNames(device_names, &devices)); - TF_EXPECT_OK(GetTPUCompilationAndExecutionDevices( - devices, /*num_replicas=*/4, /*num_cores_per_replica=*/1, - &compilation_device, &execution_devices)); + auto status_or = GetTPUCompilationAndExecutionDevices( + devices, /*num_replicas=*/1, /*num_cores_per_replica=*/1, topology_attr, + device_assignment_attr); - EXPECT_EQ(compilation_device, "/job:worker/replica:0/task:0/device:CPU:0"); + ASSERT_FALSE(status_or.ok()); + EXPECT_EQ(status_or.status().error_message(), + "no TPU device found for 'device_assignment' device coordinate (1, " + "0, 0)"); +} + +TEST(TPURewriteDeviceUtilTest, ValidFullMeshDeviceAssignment) { + llvm::SmallVector devices; + std::vector device_names = + MakeDeviceSet(/*num_tasks=*/2, /*num_devices_per_task=*/4); + ASSERT_TRUE(DeviceNamesToParsedNames(device_names, &devices)); + std::string topology_attr; + std::vector device_assignment_attr; + + auto status_or = GetTPUCompilationAndExecutionDevices( + devices, /*num_replicas=*/8, /*num_cores_per_replica=*/1, topology_attr, + device_assignment_attr); + + TF_ASSERT_OK(status_or.status()); + + auto& tpu_device_assignment = status_or.ValueOrDie(); + EXPECT_EQ(tpu_device_assignment.compilation_device, + "/job:worker/replica:0/task:0/device:CPU:0"); + auto& execution_devices = tpu_device_assignment.execution_devices; + ASSERT_EQ(execution_devices.size(), 8); + for (const auto& replica_execution_device : execution_devices) + ASSERT_EQ(replica_execution_device.size(), 1); + + EXPECT_EQ(execution_devices[0][0], + "/job:worker/replica:0/task:0/device:TPU:0"); + EXPECT_EQ(execution_devices[1][0], + "/job:worker/replica:0/task:0/device:TPU:1"); + EXPECT_EQ(execution_devices[2][0], + "/job:worker/replica:0/task:0/device:TPU:2"); + EXPECT_EQ(execution_devices[3][0], + "/job:worker/replica:0/task:0/device:TPU:3"); + EXPECT_EQ(execution_devices[4][0], + "/job:worker/replica:0/task:1/device:TPU:0"); + EXPECT_EQ(execution_devices[5][0], + "/job:worker/replica:0/task:1/device:TPU:1"); + EXPECT_EQ(execution_devices[6][0], + "/job:worker/replica:0/task:1/device:TPU:2"); + EXPECT_EQ(execution_devices[7][0], + "/job:worker/replica:0/task:1/device:TPU:3"); + + EXPECT_FALSE(tpu_device_assignment.xla_device_assignment.hasValue()); +} + +TEST(TPURewriteDeviceUtilTest, ValidGeneralDeviceAssignmentMesh2x2x2) { + tpu::TopologyProto topology_proto; + { + topology_proto.add_mesh_shape(2); + topology_proto.add_mesh_shape(2); + topology_proto.add_mesh_shape(2); + topology_proto.set_num_tasks(2); + topology_proto.set_num_tpu_devices_per_task(4); + topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(1); + topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(1); + topology_proto.add_device_coordinates(1); + topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(1); + topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(0); + 
topology_proto.add_device_coordinates(1); + topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(1); + topology_proto.add_device_coordinates(1); + topology_proto.add_device_coordinates(1); + topology_proto.add_device_coordinates(1); + topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(1); + topology_proto.add_device_coordinates(1); + topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(1); + } + + std::string topology_attr = topology_proto.SerializeAsString(); + std::vector device_assignment_attr{ + 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1}; + + llvm::SmallVector devices; + std::vector device_names = + MakeDeviceSet(/*num_tasks=*/2, /*num_devices_per_task=*/4); + ASSERT_TRUE(DeviceNamesToParsedNames(device_names, &devices)); + + auto status_or = GetTPUCompilationAndExecutionDevices( + devices, /*num_replicas=*/4, /*num_cores_per_replica=*/2, topology_attr, + device_assignment_attr); + + TF_ASSERT_OK(status_or.status()); + + auto& tpu_device_assignment = status_or.ValueOrDie(); + EXPECT_EQ(tpu_device_assignment.compilation_device, + "/job:worker/replica:0/task:0/device:CPU:0"); + auto& execution_devices = tpu_device_assignment.execution_devices; ASSERT_EQ(execution_devices.size(), 4); - EXPECT_EQ(execution_devices[0], "/job:worker/replica:0/task:0/device:TPU:0"); - EXPECT_EQ(execution_devices[1], "/job:worker/replica:0/task:0/device:TPU:1"); - EXPECT_EQ(execution_devices[2], "/job:worker/replica:0/task:1/device:TPU:0"); - EXPECT_EQ(execution_devices[3], "/job:worker/replica:0/task:1/device:TPU:1"); + for (const auto& replica_execution_device : execution_devices) + ASSERT_EQ(replica_execution_device.size(), 2); + + EXPECT_EQ(execution_devices[0][0], + "/job:worker/replica:0/task:0/device:TPU:0"); + EXPECT_EQ(execution_devices[0][1], + "/job:worker/replica:0/task:1/device:TPU:3"); + EXPECT_EQ(execution_devices[1][0], + "/job:worker/replica:0/task:0/device:TPU:1"); + EXPECT_EQ(execution_devices[1][1], + "/job:worker/replica:0/task:1/device:TPU:2"); + EXPECT_EQ(execution_devices[2][0], + "/job:worker/replica:0/task:0/device:TPU:3"); + EXPECT_EQ(execution_devices[2][1], + "/job:worker/replica:0/task:1/device:TPU:0"); + EXPECT_EQ(execution_devices[3][0], + "/job:worker/replica:0/task:0/device:TPU:2"); + EXPECT_EQ(execution_devices[3][1], + "/job:worker/replica:0/task:1/device:TPU:1"); + + auto& xla_device_assignment = tpu_device_assignment.xla_device_assignment; + ASSERT_TRUE(xla_device_assignment.hasValue()); + EXPECT_EQ(xla_device_assignment->replica_count(), 4); + EXPECT_EQ(xla_device_assignment->computation_count(), 2); + ASSERT_EQ(xla_device_assignment->computation_devices_size(), 2); + const auto& computation_device_0 = + xla_device_assignment->computation_devices(0); + ASSERT_EQ(computation_device_0.replica_device_ids_size(), 4); + const auto& computation_device_1 = + xla_device_assignment->computation_devices(1); + ASSERT_EQ(computation_device_1.replica_device_ids_size(), 4); + + EXPECT_EQ(computation_device_0.replica_device_ids(0), 0); + EXPECT_EQ(computation_device_0.replica_device_ids(1), 4); + EXPECT_EQ(computation_device_0.replica_device_ids(2), 2); + EXPECT_EQ(computation_device_0.replica_device_ids(3), 6); + EXPECT_EQ(computation_device_1.replica_device_ids(0), 1); + EXPECT_EQ(computation_device_1.replica_device_ids(1), 5); + EXPECT_EQ(computation_device_1.replica_device_ids(2), 3); + 
EXPECT_EQ(computation_device_1.replica_device_ids(3), 7); +} + +TEST(TPURewriteDeviceUtilTest, ValidGeneralDeviceAssignmentMesh1x2x3) { + tpu::TopologyProto topology_proto; + { + topology_proto.add_mesh_shape(1); + topology_proto.add_mesh_shape(2); + topology_proto.add_mesh_shape(3); + topology_proto.set_num_tasks(3); + topology_proto.set_num_tpu_devices_per_task(2); + topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(1); + topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(1); + topology_proto.add_device_coordinates(1); + topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(1); + topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(2); + topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(1); + topology_proto.add_device_coordinates(2); + } + + std::string topology_attr = topology_proto.SerializeAsString(); + std::vector device_assignment_attr{0, 0, 1, 0, 1, 1, 0, 0, 2, + 0, 1, 2, 0, 0, 0, 0, 1, 0}; + + llvm::SmallVector devices; + std::vector device_names = + MakeDeviceSet(/*num_tasks=*/3, /*num_devices_per_task=*/2); + ASSERT_TRUE(DeviceNamesToParsedNames(device_names, &devices)); + + auto status_or = GetTPUCompilationAndExecutionDevices( + devices, /*num_replicas=*/2, /*num_cores_per_replica=*/3, topology_attr, + device_assignment_attr); + + TF_ASSERT_OK(status_or.status()); + + auto& tpu_device_assignment = status_or.ValueOrDie(); + EXPECT_EQ(tpu_device_assignment.compilation_device, + "/job:worker/replica:0/task:0/device:CPU:0"); + + auto& execution_devices = tpu_device_assignment.execution_devices; + ASSERT_EQ(execution_devices.size(), 2); + for (const auto& replica_execution_device : execution_devices) + ASSERT_EQ(replica_execution_device.size(), 3); + + EXPECT_EQ(execution_devices[0][0], + "/job:worker/replica:0/task:1/device:TPU:1"); + EXPECT_EQ(execution_devices[0][1], + "/job:worker/replica:0/task:1/device:TPU:0"); + EXPECT_EQ(execution_devices[0][2], + "/job:worker/replica:0/task:2/device:TPU:0"); + EXPECT_EQ(execution_devices[1][0], + "/job:worker/replica:0/task:2/device:TPU:1"); + EXPECT_EQ(execution_devices[1][1], + "/job:worker/replica:0/task:0/device:TPU:0"); + EXPECT_EQ(execution_devices[1][2], + "/job:worker/replica:0/task:0/device:TPU:1"); + + auto& xla_device_assignment = tpu_device_assignment.xla_device_assignment; + ASSERT_TRUE(xla_device_assignment.hasValue()); + EXPECT_EQ(xla_device_assignment->replica_count(), 2); + EXPECT_EQ(xla_device_assignment->computation_count(), 3); + ASSERT_EQ(xla_device_assignment->computation_devices_size(), 3); + const auto& computation_device_0 = + xla_device_assignment->computation_devices(0); + ASSERT_EQ(computation_device_0.replica_device_ids_size(), 2); + const auto& computation_device_1 = + xla_device_assignment->computation_devices(1); + ASSERT_EQ(computation_device_1.replica_device_ids_size(), 2); + const auto& computation_device_2 = + xla_device_assignment->computation_devices(2); + ASSERT_EQ(computation_device_2.replica_device_ids_size(), 2); + + EXPECT_EQ(computation_device_0.replica_device_ids(0), 1); + EXPECT_EQ(computation_device_0.replica_device_ids(1), 5); + EXPECT_EQ(computation_device_1.replica_device_ids(0), 
4); + EXPECT_EQ(computation_device_1.replica_device_ids(1), 0); + EXPECT_EQ(computation_device_2.replica_device_ids(0), 2); + EXPECT_EQ(computation_device_2.replica_device_ids(1), 3); } } // anonymous namespace diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD index d3b7215d26d..bf2d8103872 100644 --- a/tensorflow/compiler/mlir/xla/BUILD +++ b/tensorflow/compiler/mlir/xla/BUILD @@ -142,6 +142,19 @@ cc_library( ], ) +cc_library( + name = "hlo_shape_derivation", + srcs = [], + hdrs = ["transforms/hlo_shape_derivation.h"], + deps = [ + ":hlo", + ":lhlo", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Transforms", + ], +) + cc_library( name = "lhlo_legalize_to_affine", srcs = ["transforms/lhlo_legalize_to_affine.cc"], @@ -204,6 +217,7 @@ cc_library( deps = [ ":lhlo", "@com_google_absl//absl/memory", + "@llvm-project//llvm:support", "@llvm-project//mlir:LinalgOps", "@llvm-project//mlir:LinalgTransforms", "@llvm-project//mlir:Pass", @@ -217,9 +231,9 @@ cc_library( srcs = ["transforms/hlo_legalize_to_lhlo.cc"], deps = [ ":hlo", + ":hlo_shape_derivation", ":lhlo", "@com_google_absl//absl/memory", - "@com_google_absl//absl/strings", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@llvm-project//mlir:StandardOps", diff --git a/tensorflow/compiler/mlir/xla/hlo_function_importer.cc b/tensorflow/compiler/mlir/xla/hlo_function_importer.cc index bc9bdf49a39..f00e880f36b 100644 --- a/tensorflow/compiler/mlir/xla/hlo_function_importer.cc +++ b/tensorflow/compiler/mlir/xla/hlo_function_importer.cc @@ -19,7 +19,7 @@ limitations under the License. #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/raw_ostream.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/BlockAndValueMapping.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/xla/hlo_module_importer.cc b/tensorflow/compiler/mlir/xla/hlo_module_importer.cc index f8eabeb046d..82304f95e33 100644 --- a/tensorflow/compiler/mlir/xla/hlo_module_importer.cc +++ b/tensorflow/compiler/mlir/xla/hlo_module_importer.cc @@ -15,7 +15,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/xla/hlo_module_importer.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Location.h" // TF:llvm-project #include "mlir/IR/OperationSupport.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc index 481c12b42c2..b011b6069c7 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc @@ -177,31 +177,6 @@ void ConstOp::build(Builder* builder, OperationState& result, Attribute value) { // IotaOp //===----------------------------------------------------------------------===// -OpFoldResult IotaOp::fold(ArrayRef operands) { - const auto output_type = getResult().getType().cast(); - const auto output_size = output_type.getNumElements(); - const auto dimension = iota_dimension().getSExtValue(); - const auto max_dim_size = output_type.getDimSize(dimension); - int bitwidth = output_type.getElementType().getIntOrFloatBitWidth(); - - llvm::SmallVector values; - values.reserve(output_size); - - int64_t increase_stride = output_size; - for (int i = 0; i <= dimension; i++) { - increase_stride /= output_type.getDimSize(i); - } - - int64_t current_value = 0; - for (int i = 0; i < output_size; i++) { - int64_t value = (current_value / increase_stride) % max_dim_size; - values.push_back(APInt(bitwidth, value)); - ++current_value; - } - - return DenseIntElementsAttr::get(output_type, values); -} - static LogicalResult Verify(IotaOp op) { auto shape = op.getType().cast(); if (!shape.hasRank()) return success(); @@ -1155,7 +1130,7 @@ Type SliceOp::InferOutputTypes(Builder* builder, Value operand, // Illegal attributes. ShapedType attr_ty = start_indices.getType(); if (attr_ty.getRank() != 1 || attr_ty.getNumElements() != rank || - !attr_ty.getElementType().isInteger(64) || + !attr_ty.getElementType().isSignlessInteger(64) || limit_indices.getType() != attr_ty || strides.getType() != attr_ty) return ty; diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td index 28c0a859f7d..42b42d99380 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td @@ -52,7 +52,7 @@ def HLO_FpTensor : TensorOf<[AnyFloat]>; def HLO_PredTensor : TensorOf<[HLO_Pred]>; -def HLO_Tensor : TensorOf<[AnyFloat, AnyInteger, AnyComplex]>; +def HLO_Tensor : TensorOf<[AnyFloat, AnySignlessInteger, AnyComplex]>; def HLO_ComplexTensor : TensorOf<[AnyComplex]>; @@ -64,13 +64,13 @@ def HLO_TensorOrTuple : AnyTypeOf<[HLO_Tensor, HLO_Tuple]>; // an index type (as it stores indices) but that is currently disallowed in // MLIR. def HLO_DimensionTensor : ShapedContainerType< - [AnyInteger], And<[IsTensorTypePred, HasAnyRankOfPred<[1]>]>, + [AnySignlessInteger], And<[IsTensorTypePred, HasAnyRankOfPred<[1]>]>, "a 1D tensor of dimensions">; // In general, static shaped tensor constraints should be avoided unless // it is for a legacy op which is only correct with static shapes. def HLO_StaticShapeTensor : StaticShapeTensorOf<[ - AnyFloat, AnyInteger, AnyComplex]>; + AnyFloat, AnySignlessInteger, AnyComplex]>; //===----------------------------------------------------------------------===// // XLA combined type definitions. 
@@ -122,8 +122,6 @@ def HLO_IotaOp : HLO_Op<"iota", [NoSideEffect]>, BASE_HLO_IotaOp { let results = (outs HLO_IntFpOrComplexTensor:$output); - let hasFolder = 1; - // TODO(b/130357376): Iota has special conversion logic to HLO. let hasCustomHLOConverter = 1; } @@ -786,7 +784,7 @@ def HLO_ScalarsToDimensionTensorOp : HLO_Op<"scalars_to_dimension_tensor", compute shape arguments to dynamic operations. }]; - let arguments = (ins Variadic); + let arguments = (ins Variadic); let results = (outs HLO_DimensionTensor); // Cannot be exported to legacy formats. diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_utils.h b/tensorflow/compiler/mlir/xla/ir/hlo_utils.h index 120b035e5d0..3e3570f5b54 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_utils.h +++ b/tensorflow/compiler/mlir/xla/ir/hlo_utils.h @@ -40,7 +40,7 @@ static ElementsAttr getSplat(Builder* b, Value val, T constant) { // Handle integer elements. Attribute elementAttr; - if (valElementType.isa()) + if (valElementType.isSignlessInteger()) elementAttr = b->getIntegerAttr(valElementType, constant); else if (valElementType.isa()) elementAttr = b->getFloatAttr(valElementType, constant); diff --git a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td index 794fee181a6..3a675f20d92 100644 --- a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td @@ -42,7 +42,7 @@ def LHLO_PredBuffer : MemRefOf<[HLO_Pred]>; // Any integer or floating-point tensor types def LHLO_IntOrFpBuffer : MemRefOf<[HLO_Int, AnyFloat]>; -def LHLO_Buffer : MemRefOf<[AnyFloat, AnyInteger]>; +def LHLO_Buffer : MemRefOf<[AnyFloat, AnySignlessInteger]>; def LHLO_TupleBuffer : NestedTupleOf<[LHLO_Buffer]>; diff --git a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc index 8fa7d809024..92614755ec3 100644 --- a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc +++ b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc @@ -27,7 +27,7 @@ limitations under the License. 
#include "llvm/Support/SMLoc.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Function.h" // TF:llvm-project #include "mlir/IR/Location.h" // TF:llvm-project @@ -1221,7 +1221,7 @@ LogicalResult AddDynamicParameterBindings(mlir::ModuleOp module, << "requires arg " << padding_arg_index << " to be a scalar for use as a dynamic parameter"; - if (!mlir::getElementTypeOrSelf(padding_arg_type).isa()) + if (!mlir::getElementTypeOrSelf(padding_arg_type).isSignlessInteger()) return entry_func.emitError() << "requires arg " << padding_arg_index << " to be of an int type for use as a dynamic parameter"; diff --git a/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir b/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir index fa39b77918a..2232063fd6a 100644 --- a/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir @@ -49,6 +49,14 @@ func @complex_collapse_fold(%arg0: tensor<4xcomplex>) -> tensor<4xcomplex> } +// CHECK-LABEL: @iota_not_lowered_to_constant +func @iota_not_lowered_to_constant() -> tensor<4xi32> { + // CHECK: [[RESULT:%.*]] = "xla_hlo.iota" + // CHECK: return [[RESULT]] + %0 = "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<4xi32> + return %0 : tensor<4xi32> +} + // CHECK-LABEL: @unary_einsum func @unary_einsum(%arg0: tensor<2x3xf32>) -> tensor<2x2xf32> { // CHECK: %[[ONE:.*]] = xla_hlo.constant dense<1.000000e+00> : tensor diff --git a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir index 4b2d76e586a..be6f0e6a949 100644 --- a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir +++ b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir @@ -1,4 +1,4 @@ -// RUN: tf-opt -hlo-legalize-to-lhlo -lhlo-redundant-copies-removal %s -o - | FileCheck %s --dump-input=always +// RUN: tf-opt -hlo-legalize-to-lhlo -lhlo-redundant-copies-removal -split-input-file %s -o - | FileCheck %s -dump-input-on-failure // CHECK-LABEL: func @attrs func @attrs_copy(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { @@ -11,6 +11,8 @@ func @attrs_copy(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { return } +// ----- + // CHECK-LABEL: func @func_op func @func_op(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { // CHECK: (%[[NEW_ARG0:.*]]: memref<4xf32>, %[[NEW_ARG1:.*]]: memref<4xf32>, %[[RESULT:.*]]: memref<4xf32>) @@ -20,6 +22,8 @@ func @func_op(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { // CHECK-NEXT: "xla_lhlo.terminator"() : () -> () } +// ----- + // CHECK-LABEL: func @func_op_long func @func_op_long(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { // CHECK: (%[[NEW_ARG0:.*]]: memref<4xf32>, %[[NEW_ARG1:.*]]: memref<4xf32>, %[[RESULT:.*]]: memref<4xf32>) @@ -45,6 +49,8 @@ func @func_op_long(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> // CHECK-NEXT: "xla_lhlo.terminator"() : () -> () } +// ----- + // CHECK-LABEL: func @remove_lhlo_copy_op_created_from_tensor_store func @remove_lhlo_copy_op_created_from_tensor_store(%arg0: tensor, %arg1: tensor, %arg2: memref) { %0 = "xla_hlo.max"(%arg0, %arg1) : (tensor, tensor) -> tensor @@ -58,6 +64,8 @@ func @remove_lhlo_copy_op_created_from_tensor_store(%arg0: tensor, %arg1: t // CHECK-NOT: dealloc %[[ALLOC_OPERAND]] : 
memref // CHECK: "xla_lhlo.terminator"() : () -> () +// ----- + // CHECK-LABEL: func @fusion func @fusion(%multiplier: memref<2x2xf32>, %summand_1: memref<2x2xf32>, %summand_2: memref<2x2xf32>, %result: memref<2x2xf32>) { @@ -77,6 +85,8 @@ func @fusion(%multiplier: memref<2x2xf32>, %summand_1: memref<2x2xf32>, "xla_lhlo.terminator"() : () -> () } +// ----- + // CHECK-LABEL: func @copy func @copy(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xf32> @@ -87,6 +97,8 @@ func @copy(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { return } +// ----- + // CHECK-LABEL: func @exp func @exp(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xf32> @@ -97,6 +109,8 @@ func @exp(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { return } +// ----- + // CHECK-LABEL: func @select func @select(%pred: memref<2x2xi1>, %lhs: memref<2x2xf32>, %rhs: memref<2x2xf32>, %result: memref<2x2xf32>) { @@ -110,6 +124,8 @@ func @select(%pred: memref<2x2xi1>, %lhs: memref<2x2xf32>, return } +// ----- + // CHECK-LABEL: func @compare func @compare(%lhs: memref<2x2xf32>, %rhs: memref<2x2xf32>, %result: memref<2x2xi1>) { %tensor_lhs = tensor_load %lhs : memref<2x2xf32> @@ -122,6 +138,8 @@ func @compare(%lhs: memref<2x2xf32>, %rhs: memref<2x2xf32>, %result: memref<2x2x return } +// ----- + // CHECK-LABEL: func @broadcast func @broadcast(%operand: memref<5xf32>, %result: memref<10x5xf32>) { %tensor_operand = tensor_load %operand : memref<5xf32> @@ -133,6 +151,8 @@ func @broadcast(%operand: memref<5xf32>, %result: memref<10x5xf32>) { return } +// ----- + // CHECK-LABEL: func @dyn_broadcast func @dyn_broadcast(%operand: memref) { %tensor_operand = tensor_load %operand : memref @@ -157,6 +177,8 @@ func @dyn_broadcast(%operand: memref) { return } +// ----- + // CHECK-LABEL: func @iota func @iota(%result: memref<10xi32>) { %tensor_result = "xla_hlo.iota"() @@ -166,6 +188,8 @@ func @iota(%result: memref<10xi32>) { return } +// ----- + // CHECK-LABEL: func @abs func @abs(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xf32> @@ -176,6 +200,8 @@ func @abs(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { return } +// ----- + // CHECK-LABEL: func @ceil func @ceil(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xf32> @@ -186,6 +212,8 @@ func @ceil(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { return } +// ----- + // CHECK-LABEL: func @convert func @convert(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xf32> @@ -196,6 +224,8 @@ func @convert(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { return } +// ----- + // CHECK-LABEL: func @cos func @cos(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xf32> @@ -206,6 +236,8 @@ func @cos(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { return } +// ----- + // CHECK-LABEL: func @neg func @neg(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xf32> @@ -216,6 +248,8 @@ func @neg(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { return } +// ----- + // CHECK-LABEL: func @sign func @sign(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xf32> @@ -226,6 +260,8 @@ func @sign(%operand: memref<2x2xf32>, %result: 
memref<2x2xf32>) { return } +// ----- + // CHECK-LABEL: func @tanh func @tanh(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xf32> @@ -236,6 +272,8 @@ func @tanh(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { return } +// ----- + // CHECK-LABEL: func @remainder func @remainder(%lhs: memref<2x2xf32>, %rhs: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_lhs = tensor_load %lhs : memref<2x2xf32> @@ -246,3 +284,47 @@ func @remainder(%lhs: memref<2x2xf32>, %rhs: memref<2x2xf32>, %result: memref<2x tensor_store %tensor_result, %result : memref<2x2xf32> return } + +// ----- + +// Dynamic shape binary element-wise operation. +// CHECK-LABEL: func @add_dyn +func @add_dyn(%lhs: tensor, %rhs: tensor) { + %result = "xla_hlo.add"(%lhs, %rhs) + : (tensor, tensor) -> tensor + // CHECK: %[[DIM0:.*]] = dim %arg0, 0 : memref + // CHECK: %[[IC0:.*]] = index_cast %[[DIM0]] : index to i64 + // CHECK: %[[DIM1:.*]] = dim %arg0, 1 : memref + // CHECK: %[[IC1:.*]] = index_cast %[[DIM1]] : index to i64 + // CHECK: %[[SHAPE:.*]] = "xla_hlo.scalars_to_dimension_tensor"(%[[IC0]], %[[IC1]]) : (i64, i64) -> tensor<2xi64> + // CHECK: %[[C0:.*]] = constant 0 : index + // CHECK: %[[EE0:.*]] = extract_element %[[SHAPE]][%[[C0]]] : tensor<2xi64> + // CHECK: %[[ICS0:.*]] = index_cast %[[EE0]] : i64 to index + // CHECK: %[[EE1:.*]] = extract_element %[[SHAPE]][%[[C1]]] : tensor<2xi64> + // CHECK: %[[ICS1:.*]] = index_cast %[[EE1]] : i64 to index + // CHECK: %[[RESULT:.*]] = alloc(%[[ICS0]], %[[ICS1]]) + // CHECK: "xla_lhlo.add"(%arg0, %arg1, %[[RESULT]]) : (memref, memref, memref) -> () + return +} + +// ----- + +// Dynamic shape unary element-wise operation. +// CHECK-LABEL: func @tanh_dyn +func @tanh_dyn(%arg0: tensor) { + %result = "xla_hlo.tanh"(%arg0) + : (tensor) -> tensor + // CHECK: %[[DIM0:.*]] = dim %arg0, 0 : memref + // CHECK: %[[IC0:.*]] = index_cast %[[DIM0]] : index to i64 + // CHECK: %[[DIM1:.*]] = dim %arg0, 1 : memref + // CHECK: %[[IC1:.*]] = index_cast %[[DIM1]] : index to i64 + // CHECK: %[[SHAPE:.*]] = "xla_hlo.scalars_to_dimension_tensor"(%[[IC0]], %[[IC1]]) : (i64, i64) -> tensor<2xi64> + // CHECK: %[[C0:.*]] = constant 0 : index + // CHECK: %[[EE0:.*]] = extract_element %[[SHAPE]][%[[C0]]] : tensor<2xi64> + // CHECK: %[[ICS0:.*]] = index_cast %[[EE0]] : i64 to index + // CHECK: %[[EE1:.*]] = extract_element %[[SHAPE]][%[[C1]]] : tensor<2xi64> + // CHECK: %[[ICS1:.*]] = index_cast %[[EE1]] : i64 to index + // CHECK: %[[RESULT:.*]] = alloc(%[[ICS0]], %[[ICS1]]) + // CHECK: "xla_lhlo.tanh"(%arg0, %[[RESULT]]) : (memref, memref) -> () + return +} diff --git a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-linalg.mlir b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-linalg.mlir index b5242d06dae..61add8c4389 100644 --- a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-linalg.mlir +++ b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-linalg.mlir @@ -213,3 +213,106 @@ func @select(%pred: tensor<2x2xi1>, %lhs: tensor<2x2xf32>, // CHECK-NEXT: ^bb0(%[[PRED_IN:.*]]: i1, %[[LHS_IN:.*]]: f32, %[[RHS_IN:.*]]: f32): // CHECK-NEXT: %[[RESULT:.*]] = select %[[PRED_IN]], %[[LHS_IN]], %[[RHS_IN]] : f32 // CHECK-NEXT: linalg.yield %[[RESULT]] : f32 + +// ----- + +// CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1, d2, d3, d4) -> (d4, d0, 0)> +// CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)> +// CHECK-LABEL: func @broadcast +func @broadcast(%operand: tensor<5x7x1xf32>) -> 
tensor<7x10x6x4x5xf32> { + %0 = "xla_hlo.broadcast_in_dim"(%operand) + {broadcast_dimensions = dense<[4,0,2]> : tensor<3xi64>} + : (tensor<5x7x1xf32>) -> tensor<7x10x6x4x5xf32> + return %0 : tensor<7x10x6x4x5xf32> +} +// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]] +// CHECK-NEXT: ^bb0(%[[OPERAND:.*]]: f32): +// CHECK-NEXT: linalg.yield %[[OPERAND]] : f32 + +// ----- + +// CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1, d2) -> (0)> +// CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +// CHECK-LABEL: func @broadcast_scalar +func @broadcast_scalar(%operand: tensor) -> tensor<7x10x6xf32> { + %0 = "xla_hlo.broadcast_in_dim"(%operand) + {broadcast_dimensions = dense<[]> : tensor<0xi64>} + : (tensor) -> tensor<7x10x6xf32> + return %0 : tensor<7x10x6xf32> +} +// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]] +// CHECK-NEXT: ^bb0(%[[OPERAND:.*]]: f32): +// CHECK-NEXT: linalg.yield %[[OPERAND]] : f32 + +// ----- + +// CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1, d2, d3) -> (d1, d0, d3, d2)> +// CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +// CHECK-LABEL: func @transpose +func @transpose(%arg0: tensor<2x3x9x5xi32>) -> tensor<3x2x5x9xi32> { + %0 = "xla_hlo.transpose"(%arg0) {permutation = dense<[1, 0, 3, 2]> : tensor<4xi64>} + : (tensor<2x3x9x5xi32>) -> tensor<3x2x5x9xi32> + return %0 : tensor<3x2x5x9xi32> +} +// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]] + +// ----- + +// CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1) -> (d0, 0, d1)> +// CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1) -> (d0, d1)> +// CHECK-LABEL: func @reshape_3D_2D +func @reshape_3D_2D(%arg0: tensor<12x1x42xi32>) -> tensor<12x42xi32> { + %0 = "xla_hlo.reshape"(%arg0) : (tensor<12x1x42xi32>) -> tensor<12x42xi32> + return %0 : tensor<12x42xi32> +} +// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]] + +// ----- + +// CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1) -> (d0, d1, 0, 0)> +// CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1) -> (d0, d1)> +// CHECK-LABEL: func @reshape_4D_2D +func @reshape_4D_2D(%arg0: tensor<12x42x1x1xi32>) -> tensor<12x42xi32> { + %0 = "xla_hlo.reshape"(%arg0) : (tensor<12x42x1x1xi32>) -> tensor<12x42xi32> + return %0 : tensor<12x42xi32> +} +// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]] + +// ----- + +// CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d2)> +// CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +// CHECK-LABEL: func @reshape_2D_4D +func @reshape_2D_4D(%arg0: tensor<12x42xi32>) -> tensor<12x1x42x1xi32> { + %0 = "xla_hlo.reshape"(%arg0) : (tensor<12x42xi32>) -> tensor<12x1x42x1xi32> + return %0 : tensor<12x1x42x1xi32> +} +// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]] + +// ----- + +// CHECK-LABEL: func @minf +func @minf(%lhs: tensor<2x2xf32>, %rhs: tensor<2x2xf32>) -> tensor<2x2xf32> { + %0 = "xla_hlo.min"(%lhs, %rhs) + : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> +} +// CHECK: linalg.generic +// CHECK-NEXT: ^bb0(%[[LHS_IN:.*]]: f32, %[[RHS_IN:.*]]: f32): +// CHECK-NEXT: %[[CMP:.*]] = cmpf "olt", %[[LHS_IN]], %[[RHS_IN]] : f32 +// CHECK-NEXT: %[[RESULT:.*]] = select %[[CMP]], %[[LHS_IN]], %[[RHS_IN]] : f32 +// CHECK-NEXT: linalg.yield %[[RESULT]] : f32 + +// ----- + +// CHECK-LABEL: func @maxi 
+func @maxi(%lhs: tensor<2x2xi32>, %rhs: tensor<2x2xi32>) -> tensor<2x2xi32> { + %0 = "xla_hlo.max"(%lhs, %rhs) + : (tensor<2x2xi32>, tensor<2x2xi32>) -> tensor<2x2xi32> + return %0 : tensor<2x2xi32> +} +// CHECK: linalg.generic +// CHECK-NEXT: ^bb0(%[[LHS_IN:.*]]: i32, %[[RHS_IN:.*]]: i32): +// CHECK-NEXT: %[[CMP:.*]] = cmpi "sgt", %[[LHS_IN]], %[[RHS_IN]] : i32 +// CHECK-NEXT: %[[RESULT:.*]] = select %[[CMP]], %[[LHS_IN]], %[[RHS_IN]] : i32 +// CHECK-NEXT: linalg.yield %[[RESULT]] : i32 diff --git a/tensorflow/compiler/mlir/xla/tests/iota.mlir b/tensorflow/compiler/mlir/xla/tests/iota.mlir deleted file mode 100644 index 65b9f73ba67..00000000000 --- a/tensorflow/compiler/mlir/xla/tests/iota.mlir +++ /dev/null @@ -1,61 +0,0 @@ -// RUN: tf-opt %s -split-input-file -xla-legalize-to-std | FileCheck %s - -// ----- - -// CHECK-LABEL: func @iota.const.1() -> tensor<4xi32> { -func @iota.const.1() -> tensor<4xi32> { - // CHECK-NEXT: %[[CST:.*]] = constant dense<[0, 1, 2, 3]> : tensor<4xi32> - %0 = "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<4xi32> - // CHECK-NEXT: return %[[CST]] : tensor<4xi32> - return %0 : tensor<4xi32> -} - -// ----- - -// CHECK-LABEL: func @iota.const.2() -> tensor<2x4xi32> { -func @iota.const.2() -> tensor<2x4xi32> { - // CHECK-NEXT: %[[CST:.*]] = constant dense<{{\[\[}}0, 0, 0, 0], [1, 1, 1, 1]]> : tensor<2x4xi32> - %0 = "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<2x4xi32> - // CHECK-NEXT: return %[[CST]] : tensor<2x4xi32> - return %0 : tensor<2x4xi32> -} - -// ----- - -// CHECK-LABEL: func @iota.const.3() -> tensor<2x4xi32> { -func @iota.const.3() -> tensor<2x4xi32> { - // CHECK-NEXT: %[[CST:.*]] = constant dense<{{\[\[}}0, 1, 2, 3], [0, 1, 2, 3]]> : tensor<2x4xi32> - %0 = "xla_hlo.iota"() {iota_dimension = 1 : i64} : () -> tensor<2x4xi32> - // CHECK-NEXT: return %[[CST]] : tensor<2x4xi32> - return %0 : tensor<2x4xi32> -} - -// ----- - -// CHECK-LABEL: func @iota.const.4() -> tensor<2x3x4xi32> { -func @iota.const.4() -> tensor<2x3x4xi32> { - // CHECK-NEXT: %[[CST:.*]] = constant dense<{{\[\[\[}}0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0{{\]\]}}, {{\[\[}}1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]]> : tensor<2x3x4xi32> - %0 = "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<2x3x4xi32> - // CHECK-NEXT: return %[[CST]] : tensor<2x3x4xi32> - return %0 : tensor<2x3x4xi32> -} - -// ----- - -// CHECK-LABEL: func @iota.const.5() -> tensor<2x3x4xi32> { -func @iota.const.5() -> tensor<2x3x4xi32> { - // CHECK-NEXT: %[[CST:.*]] = constant dense<{{\[\[\[}}0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2{{\]\]}}, {{\[\[}}0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]]]> : tensor<2x3x4xi32> - %0 = "xla_hlo.iota"() {iota_dimension = 1 : i64} : () -> tensor<2x3x4xi32> - // CHECK-NEXT: return %[[CST]] : tensor<2x3x4xi32> - return %0 : tensor<2x3x4xi32> -} - -// ----- - -// CHECK-LABEL: func @iota.const.6() -> tensor<2x3x4xi32> { -func @iota.const.6() -> tensor<2x3x4xi32> { - // CHECK-NEXT: %[[CST:.*]] = constant dense<{{\[\[\[}}0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3{{\]\]}}, {{\[\[}}0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3]]]> : tensor<2x3x4xi32> - %0 = "xla_hlo.iota"() {iota_dimension = 2 : i64} : () -> tensor<2x3x4xi32> - // CHECK-NEXT: return %[[CST]] : tensor<2x3x4xi32> - return %0 : tensor<2x3x4xi32> -} diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-to-std.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-to-std.mlir index 1d2cf767939..f56174ae075 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-to-std.mlir +++ 
b/tensorflow/compiler/mlir/xla/tests/legalize-to-std.mlir @@ -135,3 +135,51 @@ func @float_constant() -> (tensor, tensor<2x3xf32>, tensor<2x3xf32>) { return %0, %1, %2: tensor, tensor<2x3xf32>, tensor<2x3xf32> } +// Test Iota lowering to constant +// CHECK-LABEL: func @iota.const.1() -> tensor<4xi32> { +func @iota.const.1() -> tensor<4xi32> { + // CHECK-NEXT: %[[CST:.*]] = constant dense<[0, 1, 2, 3]> : tensor<4xi32> + %0 = "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<4xi32> + // CHECK-NEXT: return %[[CST]] : tensor<4xi32> + return %0 : tensor<4xi32> +} + +// CHECK-LABEL: func @iota.const.2() -> tensor<2x4xi32> { +func @iota.const.2() -> tensor<2x4xi32> { + // CHECK-NEXT: %[[CST:.*]] = constant dense<{{\[\[}}0, 0, 0, 0], [1, 1, 1, 1]]> : tensor<2x4xi32> + %0 = "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<2x4xi32> + // CHECK-NEXT: return %[[CST]] : tensor<2x4xi32> + return %0 : tensor<2x4xi32> +} + +// CHECK-LABEL: func @iota.const.3() -> tensor<2x4xi32> { +func @iota.const.3() -> tensor<2x4xi32> { + // CHECK-NEXT: %[[CST:.*]] = constant dense<{{\[\[}}0, 1, 2, 3], [0, 1, 2, 3]]> : tensor<2x4xi32> + %0 = "xla_hlo.iota"() {iota_dimension = 1 : i64} : () -> tensor<2x4xi32> + // CHECK-NEXT: return %[[CST]] : tensor<2x4xi32> + return %0 : tensor<2x4xi32> +} + +// CHECK-LABEL: func @iota.const.4() -> tensor<2x3x4xi32> { +func @iota.const.4() -> tensor<2x3x4xi32> { + // CHECK-NEXT: %[[CST:.*]] = constant dense<{{\[\[\[}}0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0{{\]\]}}, {{\[\[}}1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]]> : tensor<2x3x4xi32> + %0 = "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<2x3x4xi32> + // CHECK-NEXT: return %[[CST]] : tensor<2x3x4xi32> + return %0 : tensor<2x3x4xi32> +} + +// CHECK-LABEL: func @iota.const.5() -> tensor<2x3x4xi32> { +func @iota.const.5() -> tensor<2x3x4xi32> { + // CHECK-NEXT: %[[CST:.*]] = constant dense<{{\[\[\[}}0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2{{\]\]}}, {{\[\[}}0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]]]> : tensor<2x3x4xi32> + %0 = "xla_hlo.iota"() {iota_dimension = 1 : i64} : () -> tensor<2x3x4xi32> + // CHECK-NEXT: return %[[CST]] : tensor<2x3x4xi32> + return %0 : tensor<2x3x4xi32> +} + +// CHECK-LABEL: func @iota.const.6() -> tensor<2x3x4xi32> { +func @iota.const.6() -> tensor<2x3x4xi32> { + // CHECK-NEXT: %[[CST:.*]] = constant dense<{{\[\[\[}}0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3{{\]\]}}, {{\[\[}}0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3]]]> : tensor<2x3x4xi32> + %0 = "xla_hlo.iota"() {iota_dimension = 2 : i64} : () -> tensor<2x3x4xi32> + // CHECK-NEXT: return %[[CST]] : tensor<2x3x4xi32> + return %0 : tensor<2x3x4xi32> +} diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir index a9ffc116392..7f7e37ebe66 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir @@ -1,6 +1,6 @@ // RUN: tf-opt -lhlo-fuse-linalg %s -o - | FileCheck %s --dump-input=always -// RUN: tf-opt -lhlo-fuse-linalg -tile-sizes-for-linalg-fusion=2,3 %s -o - | FileCheck %s -check-prefix=TILED --dump-input-on-failure -// RUN: tf-opt -lhlo-fuse-linalg -tile-to-parallel-loops-for-linalg-fusion %s -o - | FileCheck %s -check-prefix=PLOOP --dump-input-on-failure +// RUN: tf-opt -lhlo-fuse-linalg=tile-sizes=2,3 %s -o - | FileCheck %s -check-prefix=TILED --dump-input-on-failure +// RUN: tf-opt -lhlo-fuse-linalg=use-parallel-loops %s -o - | FileCheck %s -check-prefix=PLOOP --dump-input-on-failure #map0 = 
affine_map<(d0, d1) -> (d0, d1)>
diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir
index 78f0d9ffb18..33d5884a882 100644
--- a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir
+++ b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir
@@ -210,6 +210,7 @@ func @broadcast(%operand: memref<5x7x1xf32>, %result: memref<7x10x6x4x5xf32>) {

 // -----

+// CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1, d2) -> (0)>
 // CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 // CHECK-LABEL: func @broadcast_scalar
 func @broadcast_scalar(%operand: memref<f32>, %result: memref<7x10x6xf32>) {
@@ -218,10 +219,9 @@ func @broadcast_scalar(%operand: memref<f32>, %result: memref<7x10x6xf32>) {
     : (memref<f32>, memref<7x10x6xf32>) -> ()
   return
 }
-// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[RESULT_MAP]]]
-// CHECK-NEXT: ^bb0(%[[RESULT:.*]]: f32):
-// CHECK-NEXT: %[[CONST:.*]] = load %{{.*}} : memref<f32>
-// CHECK-NEXT: linalg.yield %[[CONST]] : f32
+// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]]
+// CHECK-NEXT: ^bb0(%[[OPERAND:[a-zA-Z0-9_]*]]: f32, %[[RESULT:.*]]: f32):
+// CHECK-NEXT: linalg.yield %[[OPERAND]] : f32

 // -----

@@ -450,3 +450,36 @@ func @slice(%operand: memref, %result: memref) {
 // CHECK: %[[RHS:.*]] = linalg.range %[[R0]] : %[[R2]] : %[[R1]]
 // CHECK: %[[RESULT:.*]] = linalg.slice %[[IN]][%[[LHS]], %[[RHS]]]
 // CHECK: linalg.copy(%[[RESULT]], %[[OUT]])
+
+// -----
+
+// CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1) -> (d0, 0, d1)>
+// CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1) -> (d0, d1)>
+// CHECK-LABEL: func @reshape_3D_2D
+func @reshape_3D_2D(%arg0: memref<12x1x42xi32>, %arg1 : memref<12x42xi32>) {
+  "xla_lhlo.reshape"(%arg0, %arg1) : (memref<12x1x42xi32>, memref<12x42xi32>) -> ()
+  return
+}
+// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]]
+
+// -----
+
+// CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1) -> (d0, d1, 0, 0)>
+// CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1) -> (d0, d1)>
+// CHECK-LABEL: func @reshape_4D_2D
+func @reshape_4D_2D(%arg0: memref<12x42x1x1xi32>, %arg1 : memref<12x42xi32>) {
+  "xla_lhlo.reshape"(%arg0, %arg1) : (memref<12x42x1x1xi32>, memref<12x42xi32>) -> ()
+  return
+}
+// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]]
+
+// -----
+
+// CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d2)>
+// CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
+// CHECK-LABEL: func @reshape_2D_4D
+func @reshape_2D_4D(%arg0: memref<12x42xi32>, %arg1 : memref<12x1x42x1xi32>) {
+  "xla_lhlo.reshape"(%arg0, %arg1) : (memref<12x42xi32>, memref<12x1x42x1xi32>) -> ()
+  return
+}
+// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]]
diff --git a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc
index 77c361a8ab5..29d399c68fa 100644
--- a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc
+++ b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc
@@ -16,7 +16,7 @@ limitations under the License.

 // This file implements logic for lowering HLO dialect to LHLO dialect.
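As an editorial aside before the implementation diffs: the three reshape tests above pin down the operand indexing maps that the new ReshapeAddRemoveDimConverter (added later in this diff) derives by walking the operand and result shapes in lockstep. The following is a minimal standalone C++ sketch of that walk, with invented names and no MLIR dependencies; it is an illustration, not the pass code itself.

// Standalone illustration (not part of the patch): for a reshape that only
// adds or removes size-1 dimensions, decide which result dimension each
// operand dimension reads from. A value of -1 marks an operand dim of size 1
// that collapses to the constant affine expression (0).
#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

std::optional<std::vector<int64_t>> MatchAddRemoveUnitDims(
    const std::vector<int64_t>& operand_shape,
    const std::vector<int64_t>& result_shape) {
  std::vector<int64_t> operand_to_result;  // per operand dim: result dim or -1
  size_t r = 0, o = 0;
  while (r < result_shape.size() && o < operand_shape.size()) {
    if (result_shape[r] == operand_shape[o]) {
      operand_to_result.push_back(r);  // sizes agree: use the result loop index
      ++r;
      ++o;
    } else if (result_shape[r] == 1) {
      ++r;  // extra size-1 dim added in the result: skip it
    } else if (operand_shape[o] == 1) {
      operand_to_result.push_back(-1);  // size-1 operand dim dropped: map to 0
      ++o;
    } else {
      return std::nullopt;  // a genuine reshape, not just unit-dim changes
    }
  }
  // Any trailing dimensions on either side must all be of size 1.
  for (; r < result_shape.size(); ++r)
    if (result_shape[r] != 1) return std::nullopt;
  for (; o < operand_shape.size(); ++o) {
    if (operand_shape[o] != 1) return std::nullopt;
    operand_to_result.push_back(-1);
  }
  return operand_to_result;
}

int main() {
  // memref<12x1x42xi32> -> memref<12x42xi32>: prints "0 -1 1", i.e. the
  // operand map (d0, 0, d1) checked by the reshape_3D_2D test above.
  auto m = MatchAddRemoveUnitDims({12, 1, 42}, {12, 42});
  for (int64_t d : *m) std::cout << d << ' ';
  std::cout << '\n';
}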
#include "absl/memory/memory.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/BlockAndValueMapping.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project @@ -30,6 +30,7 @@ limitations under the License. #include "mlir/Transforms/DialectConversion.h" // TF:llvm-project #include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" #include "tensorflow/compiler/mlir/xla/ir/lhlo_ops.h" +#include "tensorflow/compiler/mlir/xla/transforms/hlo_shape_derivation.h" #include "tensorflow/compiler/mlir/xla/transforms/passes.h" #include "tensorflow/compiler/mlir/xla/transforms/rewriters.h" @@ -127,9 +128,24 @@ class HloToLhloOpConverter : public ConversionPattern { ConversionPatternRewriter& rewriter) const final { const auto& original_results = op->getResults(); SmallVector buffer_args(operands.begin(), operands.end()); - for (auto result : original_results) { - buffer_args.push_back( - InsertAllocAndDealloc(op->getLoc(), result, &rewriter)); + for (auto result : llvm::enumerate(original_results)) { + RankedTensorType resultType = + result.value().getType().dyn_cast(); + if (!resultType) { + return matchFailure(); + } + if (resultType.hasStaticShape()) { + buffer_args.push_back( + InsertAllocAndDealloc(op->getLoc(), result.value(), &rewriter)); + } else { + Value shape_value = ShapeDerivation::impl::deriveShapeFromOp( + op, result.index(), &rewriter); + if (!shape_value) { + return matchFailure(); + } + buffer_args.push_back(InsertDynamicAllocAndDealloc( + op->getLoc(), result.value(), shape_value, &rewriter)); + } } rewriter.create(op->getLoc(), llvm::None, buffer_args, op->getAttrs()); @@ -320,6 +336,7 @@ struct HloLegalizeToLhlo : public ModulePass { target.addIllegalOp(); target.addIllegalOp(); target.addLegalOp(); + target.addLegalOp(); target.addIllegalDialect(); target.addDynamicallyLegalOp([&](FuncOp op) { auto inputs = op.getType().getInputs(); diff --git a/tensorflow/compiler/mlir/xla/transforms/hlo_shape_derivation.h b/tensorflow/compiler/mlir/xla/transforms/hlo_shape_derivation.h new file mode 100644 index 00000000000..d2a1f47e540 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/transforms/hlo_shape_derivation.h @@ -0,0 +1,130 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_MLIR_XLA_TRANSFORMS_HLO_SHAPE_DERIVATION_H_
+#define TENSORFLOW_COMPILER_MLIR_XLA_TRANSFORMS_HLO_SHAPE_DERIVATION_H_
+
+#include "mlir/Dialect/StandardOps/IR/Ops.h"  // TF:llvm-project
+#include "mlir/IR/Attributes.h"  // TF:llvm-project
+#include "mlir/IR/Builders.h"  // TF:llvm-project
+#include "mlir/IR/Location.h"  // TF:llvm-project
+#include "mlir/IR/MLIRContext.h"  // TF:llvm-project
+#include "mlir/IR/Operation.h"  // TF:llvm-project
+#include "mlir/Transforms/DialectConversion.h"  // TF:llvm-project
+#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h"
+
+namespace mlir {
+namespace xla_hlo {
+
+// This file contains implementations for shape derivation functions that,
+// given some operation and a result number, produce IR that computes the
+// shape of the given result at runtime based on operands of the provided
+// operation.
+// These should be generated at some point based on annotations on the HLO
+// using the new shape dialect. While this is still in the works, we hardcode
+// the expected IR here to unblock progress.
+// The implementation is based on templates to allow for using these derivation
+// functions in templated code.
+
+namespace impl {
+
+struct UnknownShape {
+  // Default shape derivation function that simply fails with a runtime error.
+  static Value deriveShapeFromOp(Operation* op, int result_position,
+                                 ConversionPatternRewriter* rewriter) {
+    op->emitOpError()
+        << "dynamic result shapes cannot be derived for this operation";
+    return {};
+  }
+};
+
+struct SameShapeAsFirstOperand {
+  // Shape derivation function that computes the shape of the result based on
+  // the first argument. For a 2-dimensional input tensor, this produces IR of
+  // the form
+  //
+  //   %0 = dim %arg0, 0 : memref<?x?xf32>
+  //   %1 = index_cast %0 : index to i64
+  //   %2 = dim %arg0, 1 : memref<?x?xf32>
+  //   %3 = index_cast %2 : index to i64
+  //   %4 = "xla_hlo.scalars_to_dimension_tensor"(%1, %3)
+  //       : (i64, i64) -> tensor<2xi64>
+  //
+  // and returns %4 as the shape value.
+  static Value deriveShapeFromOp(Operation* op, int result_position,
+                                 ConversionPatternRewriter* rewriter) {
+    Value operand = op->getOperand(0);
+    ShapedType operand_type = operand.getType().dyn_cast<ShapedType>();
+    if (!operand_type) {
+      op->emitOpError() << "first operand has no shaped type";
+      return {};
+    }
+    auto loc = op->getLoc();
+    SmallVector<Value, 4> shape_values;
+    shape_values.reserve(operand_type.getRank());
+    auto shape_scalar_type = rewriter->getIntegerType(64);
+    for (auto element : llvm::enumerate(operand_type.getShape())) {
+      if (element.value() == ShapedType::kDynamicSize) {
+        Value dim = rewriter->create<DimOp>(loc, operand, element.index());
+        shape_values.push_back(
+            rewriter->create<IndexCastOp>(loc, dim, shape_scalar_type));
+      } else {
+        shape_values.push_back(rewriter->create<ConstantOp>(
+            loc, rewriter->getI64IntegerAttr(element.value())));
+      }
+    }
+    return rewriter->create<ScalarsToDimensionTensorOp>(
+        loc, RankedTensorType::get({operand_type.getRank()}, shape_scalar_type),
+        shape_values);
+  }
+};
+
+}  // namespace impl
+
+// Default template to cover HLO operations whose shape derivation is unknown.
+template <typename HloOpTy>
+struct ShapeDerivation {
+  using impl = impl::UnknownShape;
+};
+
+// Element-wise operations that have the shape of their first operand.
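As an aside (the registration macro for these element-wise ops continues in the patch just below): ShapeDerivation is the classic specialization-as-trait idiom, where a primary template supplies a failing default and per-op specializations swap in a real rule. A self-contained C++ sketch of the idiom follows; every name in it (ShapeRule, OpA, OpB, and so on) is an invented stand-in, not TensorFlow code.

// Standalone illustration (not part of the patch).
#include <iostream>

struct OpA {};  // pretend HLO op with a known shape rule
struct OpB {};  // pretend HLO op without one

struct UnknownShape {
  static const char* derive() { return "error: shape underivable"; }
};
struct SameShapeAsFirstOperand {
  static const char* derive() { return "shape of operand #0"; }
};

// Primary template: every op defaults to the failing rule.
template <typename OpTy>
struct ShapeRule {
  using impl = UnknownShape;
};

// Per-op specializations opt in, as SAME_SHAPE_AS_FIRST_OPERAND does above.
#define REGISTER_SAME_SHAPE(Op)           \
  template <>                             \
  struct ShapeRule<Op> {                  \
    using impl = SameShapeAsFirstOperand; \
  };

REGISTER_SAME_SHAPE(OpA)
#undef REGISTER_SAME_SHAPE

int main() {
  std::cout << ShapeRule<OpA>::impl::derive() << "\n";  // shape of operand #0
  std::cout << ShapeRule<OpB>::impl::derive() << "\n";  // error: underivable
}

The payoff of the idiom is that templated conversion code, such as the converter in hlo_legalize_to_lhlo.cc above, can consult the rule for its op parameter without knowing at the call site whether a derivation exists.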
+
+#define SAME_SHAPE_AS_FIRST_OPERAND(Op)           \
+  template <>                                     \
+  struct ShapeDerivation<Op> {                    \
+    using impl = impl::SameShapeAsFirstOperand;   \
+  };
+
+SAME_SHAPE_AS_FIRST_OPERAND(AbsOp)
+SAME_SHAPE_AS_FIRST_OPERAND(AddOp)
+SAME_SHAPE_AS_FIRST_OPERAND(AndOp)
+SAME_SHAPE_AS_FIRST_OPERAND(CeilOp)
+SAME_SHAPE_AS_FIRST_OPERAND(CosOp)
+SAME_SHAPE_AS_FIRST_OPERAND(DivOp)
+SAME_SHAPE_AS_FIRST_OPERAND(ExpOp)
+SAME_SHAPE_AS_FIRST_OPERAND(MaxOp)
+SAME_SHAPE_AS_FIRST_OPERAND(MinOp)
+SAME_SHAPE_AS_FIRST_OPERAND(MulOp)
+SAME_SHAPE_AS_FIRST_OPERAND(NegOp)
+SAME_SHAPE_AS_FIRST_OPERAND(RemOp)
+SAME_SHAPE_AS_FIRST_OPERAND(SubOp)
+SAME_SHAPE_AS_FIRST_OPERAND(TanhOp)
+
+#undef SAME_SHAPE_AS_FIRST_OPERAND
+
+}  // namespace xla_hlo
+}  // namespace mlir
+
+#endif  // TENSORFLOW_COMPILER_MLIR_XLA_TRANSFORMS_HLO_SHAPE_DERIVATION_H_
diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_control_flow.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_control_flow.cc
index 8351f94d172..72ea2e18ec0 100644
--- a/tensorflow/compiler/mlir/xla/transforms/legalize_control_flow.cc
+++ b/tensorflow/compiler/mlir/xla/transforms/legalize_control_flow.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/Casting.h"
-#include "mlir/Dialect/StandardOps/Ops.h"  // TF:llvm-project
+#include "mlir/Dialect/StandardOps/IR/Ops.h"  // TF:llvm-project
 #include "mlir/IR/Block.h"  // TF:llvm-project
 #include "mlir/IR/BlockAndValueMapping.h"  // TF:llvm-project
 #include "mlir/IR/Builders.h"  // TF:llvm-project
diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc
index da135ea1860..8f955d6944a 100644
--- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc
+++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc
@@ -24,7 +24,7 @@ limitations under the License.
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
-#include "mlir/Dialect/StandardOps/Ops.h"  // TF:llvm-project
+#include "mlir/Dialect/StandardOps/IR/Ops.h"  // TF:llvm-project
 #include "mlir/Dialect/Traits.h"  // TF:llvm-project
 #include "mlir/IR/Attributes.h"  // TF:llvm-project
 #include "mlir/IR/Diagnostics.h"  // TF:llvm-project
@@ -119,7 +119,7 @@ static DenseIntElementsAttr GetI32ElementsAttr(ArrayRef<int32_t> values,
 Type GetSumAccumulationType(Type input_type) {
   MLIRContext *ctx = input_type.getContext();
   if (input_type.isBF16() || input_type.isF16()) return FloatType::getF32(ctx);
-  if (input_type.isInteger(8) || input_type.isInteger(16))
+  if (input_type.isSignlessInteger(8) || input_type.isSignlessInteger(16))
     return IntegerType::get(32, ctx);
   return input_type;
 }
@@ -1274,7 +1274,7 @@ class ConvertMaxPoolOp : public OpRewritePattern<TF::MaxPoolOp> {
                                      PatternRewriter &rewriter) const override {
     Type element_type =
         op.input().getType().cast<TensorType>().getElementType();
-    if (!element_type.isIntOrFloat()) return matchFailure();
+    if (!element_type.isSignlessIntOrFloat()) return matchFailure();
     Location loc = op.getLoc();
     ConstOp init = GetMinValueForType(element_type, loc, &rewriter);
@@ -2248,7 +2248,7 @@ class ConvertArgMinMaxOp : public OpRewritePattern<OpTy> {
     Type input_element_type = input_type.getElementType();
     // TODO(bixia): Clarify whether tf.ArgMax supports complex data types. If
     // tf.ArgMax doesn't support complex data types, this check can be removed.
- if (!input_element_type.isIntOrFloat()) return this->matchFailure(); + if (!input_element_type.isSignlessIntOrFloat()) return this->matchFailure(); Location loc = op.getLoc(); Value init_value = diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_control_flow.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_control_flow.cc index 58e98a881e9..265466ef3a4 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_control_flow.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_control_flow.cc @@ -28,7 +28,7 @@ limitations under the License. #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringSet.h" #include "llvm/ADT/iterator_range.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/BlockAndValueMapping.h" // TF:llvm-project #include "mlir/IR/Function.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td index 872a288c259..519ba9235f1 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td @@ -16,7 +16,7 @@ limitations under the License. // This is the legalization pattern definition file for TF to XLA. include "mlir/IR/OpBase.td" -include "mlir/Dialect/StandardOps/Ops.td" +include "mlir/Dialect/StandardOps/IR/Ops.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" include "tensorflow/compiler/mlir/xla/ir/hlo_ops.td" @@ -504,8 +504,8 @@ def : Pat<(TF_SignOp $x), )>; def BothElementTypesSameWidthIntOrFloat : Constraint, "element types must be integers or floats of same width">; diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard.cc index 5e12abc466c..3c15f0be7e8 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard.cc @@ -16,7 +16,7 @@ limitations under the License. // This file implements logic for lowering XLA dialect to Standard dialect. #include "llvm/ADT/StringSwitch.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Function.h" // TF:llvm-project #include "mlir/IR/PatternMatch.h" // TF:llvm-project #include "mlir/Pass/Pass.h" // TF:llvm-project @@ -24,12 +24,6 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/xla/transforms/passes.h" #include "tensorflow/compiler/mlir/xla/transforms/rewriters.h" -using mlir::Builder; -using mlir::FunctionPass; -using mlir::OpPassBase; -using mlir::OwningRewritePatternList; -using mlir::PassRegistration; - namespace mlir { namespace { #include "tensorflow/compiler/mlir/xla/transforms/generated_legalize_to_standard.inc" @@ -37,27 +31,25 @@ namespace { namespace xla_hlo { namespace { -struct CompareIConvert : public RewritePattern { - explicit CompareIConvert(MLIRContext *context) - : RewritePattern("xla_hlo.compare", 1, context) {} +class CompareIConvert : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; - PatternMatchResult matchAndRewrite(Operation *op, + PatternMatchResult matchAndRewrite(xla_hlo::CompareOp op, PatternRewriter &rewriter) const override { - auto compare_op = cast(op); - - auto lhs = compare_op.lhs(); - auto rhs = compare_op.rhs(); + auto lhs = op.lhs(); + auto rhs = op.rhs(); auto lhs_type = lhs.getType().cast(); auto rhs_type = rhs.getType().cast(); // Broadcasting not supported by this rewrite. if (lhs_type.getShape() != rhs_type.getShape()) return matchFailure(); - if (!lhs_type.getElementType().isa() || - !rhs_type.getElementType().isa()) + if (!lhs_type.getElementType().isSignlessInteger() || + !rhs_type.getElementType().isSignlessInteger()) return matchFailure(); - auto comparison_direction = compare_op.comparison_direction(); + auto comparison_direction = op.comparison_direction(); auto compare_predicate = llvm::StringSwitch>(comparison_direction) .Case("EQ", CmpIPredicate::eq) @@ -76,16 +68,14 @@ struct CompareIConvert : public RewritePattern { } }; -struct CompareFConvert : public RewritePattern { - explicit CompareFConvert(MLIRContext *context) - : RewritePattern("xla_hlo.compare", 1, context) {} +class CompareFConvert : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; - PatternMatchResult matchAndRewrite(Operation *op, + PatternMatchResult matchAndRewrite(xla_hlo::CompareOp op, PatternRewriter &rewriter) const override { - auto compare_op = cast(op); - - auto lhs = compare_op.lhs(); - auto rhs = compare_op.rhs(); + auto lhs = op.lhs(); + auto rhs = op.rhs(); auto lhs_type = lhs.getType().cast(); auto rhs_type = rhs.getType().cast(); @@ -96,7 +86,7 @@ struct CompareFConvert : public RewritePattern { !rhs_type.getElementType().isa()) return matchFailure(); - auto comparison_direction = compare_op.comparison_direction(); + auto comparison_direction = op.comparison_direction(); CmpFPredicate compare_predicate = llvm::StringSwitch(comparison_direction) .Case("EQ", CmpFPredicate::OEQ) @@ -115,9 +105,43 @@ struct CompareFConvert : public RewritePattern { } }; +class ConvertIotaOp : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(xla_hlo::IotaOp op, + PatternRewriter &rewriter) const override { + auto output_type = op.getType().cast(); + // TODO(prakalps): Handle FP and ComplexType iota ops. 
+    if (!output_type.getElementType().isSignlessInteger())
+      return matchFailure();
+    auto output_size = output_type.getNumElements();
+    auto dimension = op.iota_dimension().getSExtValue();
+    auto max_dim_size = output_type.getDimSize(dimension);
+    int bitwidth = output_type.getElementType().getIntOrFloatBitWidth();
+
+    llvm::SmallVector<APInt, 10> values;
+    values.reserve(output_size);
+
+    int64_t increase_stride = output_size;
+    for (int i = 0; i <= dimension; i++) {
+      increase_stride /= output_type.getDimSize(i);
+    }
+
+    int64_t current_value = 0;
+    for (int i = 0; i < output_size; i++) {
+      int64_t value = (current_value / increase_stride) % max_dim_size;
+      values.push_back(APInt(bitwidth, value));
+      ++current_value;
+    }
+
+    rewriter.replaceOpWithNewOp<mlir::ConstantOp>(
+        op, DenseIntElementsAttr::get(output_type, values));
+    return matchSuccess();
+  }
+};
+
 }  // end anonymous namespace
-}  // end namespace xla_hlo
-}  // end namespace mlir

 namespace {
 struct LegalizeToStandard : public FunctionPass<LegalizeToStandard> {
@@ -126,17 +150,14 @@ };
 }  // end anonymous namespace

-std::unique_ptr<OpPassBase<FuncOp>>
-mlir::xla_hlo::createLegalizeToStdPass() {
+std::unique_ptr<OpPassBase<FuncOp>> createLegalizeToStdPass() {
   return std::make_unique<LegalizeToStandard>();
 }

-void mlir::xla_hlo::PopulateXlaToStdPatterns(OwningRewritePatternList *patterns,
-                                             mlir::MLIRContext *ctx) {
+void PopulateXlaToStdPatterns(OwningRewritePatternList *patterns,
+                              mlir::MLIRContext *ctx) {
   mlir::populateWithGenerated(ctx, patterns);
-  patterns
-      ->insert<mlir::xla_hlo::CompareFConvert, mlir::xla_hlo::CompareIConvert>(
-          ctx);
+  patterns->insert<CompareFConvert, CompareIConvert, ConvertIotaOp>(ctx);
 }

 /// Perform the lowering to standard dialect.
@@ -148,3 +169,6 @@ void LegalizeToStandard::runOnFunction() {
 static PassRegistration<LegalizeToStandard> legalize_pass(
     "xla-legalize-to-std", "Legalize from XLA dialect to standard dialect");
+
+}  // end namespace xla_hlo
+}  // end namespace mlir
diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard_patterns.td b/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard_patterns.td
index a15b28193cd..c0f6c2c3541 100644
--- a/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard_patterns.td
+++ b/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard_patterns.td
@@ -16,7 +16,7 @@ limitations under the License.

 // This is the legalization pattern definition file for XLA to StandardOps.
 include "mlir/IR/OpBase.td"
-include "mlir/Dialect/StandardOps/Ops.td"
+include "mlir/Dialect/StandardOps/IR/Ops.td"
 include "tensorflow/compiler/mlir/xla/ir/hlo_ops.td"

 //===----------------------------------------------------------------------===//
diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc
index 6b2b548550a..a52d2318ba7 100644
--- a/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc
+++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc
@@ -18,31 +18,26 @@ limitations under the License.
#include "mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h" #include "absl/memory/memory.h" +#include "llvm/ADT/ArrayRef.h" #include "mlir/Dialect/Linalg/Utils/Utils.h" // TF:llvm-project #include "mlir/Pass/Pass.h" // TF:llvm-project #include "mlir/Transforms/FoldUtils.h" // TF:llvm-project -// NOLINTNEXTLINE -static llvm::cl::opt tile_to_parallel_loops_for_linalg_fusion( - "tile-to-parallel-loops-for-linalg-fusion", - llvm::cl::desc( - "Tiles GenericOp consumer to parallel loops before linalg fusion"), - llvm::cl::init(false)); - -// NOLINTNEXTLINE -static llvm::cl::list tile_sizes_for_linalg_fusion( - "tile-sizes-for-linalg-fusion", - llvm::cl::desc( - "Tile sizes by which to tile linalg generic before linalg fusion"), - llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated); - namespace mlir { namespace xla_lhlo { namespace { using linalg::LinalgOp; -struct LhloFuseLinalg : public FunctionPass { +class LhloFuseLinalg : public FunctionPass { + public: + LhloFuseLinalg() = default; + LhloFuseLinalg(const LhloFuseLinalg&) {} + LhloFuseLinalg(bool use_parallel_loops, llvm::ArrayRef tile_sizes) { + tile_sizes_->assign(tile_sizes.begin(), tile_sizes.end()); + use_parallel_loops_.setValue(use_parallel_loops); + } + void runOnFunction() override { auto func = getFunction(); @@ -64,8 +59,8 @@ struct LhloFuseLinalg : public FunctionPass { OpBuilder b(func); OperationFolder folder(func.getContext()); func.walk([&](linalg::GenericOp generic_op) { - SmallVector tile_sizes(tile_sizes_for_linalg_fusion.begin(), - tile_sizes_for_linalg_fusion.end()); + SmallVector tile_sizes(tile_sizes_.begin(), + tile_sizes_.end()); if (tile_sizes.empty()) { tile_sizes = SmallVector(generic_op.getNumInputsAndOutputs(), 1); @@ -105,13 +100,25 @@ struct LhloFuseLinalg : public FunctionPass { bool tileGenericOp(LinalgOp op, ArrayRef tile_sizes, OpBuilder* b, OperationFolder* folder) { auto tiled_generic_op = - tile_to_parallel_loops_for_linalg_fusion + use_parallel_loops_ ? linalg::tileLinalgOpToParallelLoops(*b, op, tile_sizes, /*permutation=*/{}, folder) : linalg::tileLinalgOp(*b, op, tile_sizes, /*permutation=*/{}, folder); return tiled_generic_op.hasValue(); } + + Option use_parallel_loops_{ + *this, "use-parallel-loops", + llvm::cl::desc( + "Tiles GenericOp consumer to parallel loops before linalg fusion"), + llvm::cl::init(false)}; + + ListOption tile_sizes_{ + *this, "tile-sizes", + llvm::cl::desc( + "Tile sizes by which to tile linalg generic before linalg fusion"), + llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated}; }; } // namespace diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_affine.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_affine.cc index b0f6b83038a..2c550465302 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_affine.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_affine.cc @@ -17,7 +17,7 @@ limitations under the License. 
#include "absl/memory/memory.h" #include "mlir/Dialect/AffineOps/AffineOps.h" // TF:llvm-project -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Location.h" // TF:llvm-project #include "mlir/IR/MLIRContext.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_gpu.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_gpu.cc index e991b186d72..c9245d93e56 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_gpu.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_gpu.cc @@ -22,7 +22,7 @@ limitations under the License. #include "mlir/Dialect/GPU/GPUDialect.h" // TF:llvm-project #include "mlir/Dialect/Linalg/IR/LinalgOps.h" // TF:llvm-project #include "mlir/Dialect/LoopOps/LoopOps.h" // TF:llvm-project -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/BlockAndValueMapping.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/xla/transforms/lower_complex_patterns.td b/tensorflow/compiler/mlir/xla/transforms/lower_complex_patterns.td index d8a5ae6c6de..dcb0ab20e9e 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lower_complex_patterns.td +++ b/tensorflow/compiler/mlir/xla/transforms/lower_complex_patterns.td @@ -17,7 +17,7 @@ limitations under the License. // equivalent real value operations. include "mlir/IR/OpBase.td" -include "mlir/Dialect/StandardOps/Ops.td" +include "mlir/Dialect/StandardOps/IR/Ops.td" include "tensorflow/compiler/mlir/xla/ir/hlo_ops.td" //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/xla/transforms/lower_general_dot.cc b/tensorflow/compiler/mlir/xla/transforms/lower_general_dot.cc index c956cd6b277..f18607dfffb 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lower_general_dot.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lower_general_dot.cc @@ -17,7 +17,7 @@ limitations under the License. #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringSwitch.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Function.h" // TF:llvm-project #include "mlir/IR/Location.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/xla/transforms/map_xla_to_scalar_op.h b/tensorflow/compiler/mlir/xla/transforms/map_xla_to_scalar_op.h index b7b807333ba..6554942954e 100644 --- a/tensorflow/compiler/mlir/xla/transforms/map_xla_to_scalar_op.h +++ b/tensorflow/compiler/mlir/xla/transforms/map_xla_to_scalar_op.h @@ -18,7 +18,7 @@ limitations under the License. 
#include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" #include "tensorflow/compiler/mlir/xla/ir/lhlo_ops.h" @@ -167,7 +167,14 @@ inline Value MapXlaOpToStdScalarOp(xla_hlo::AndOp xla_op, xla_op.getLoc(), result_types, args, b); } -inline Optional getFloatCmpPredicate( +template +inline Optional getCmpPredicate( + StringRef xla_comparison_direction) { + return llvm::None; +} + +template <> +inline Optional getCmpPredicate( StringRef xla_comparison_direction) { return llvm::StringSwitch(xla_comparison_direction) .Case("EQ", CmpFPredicate::OEQ) @@ -179,7 +186,8 @@ inline Optional getFloatCmpPredicate( .Default(CmpFPredicate::NumPredicates); } -inline Optional getIntCmpPredicate( +template <> +inline Optional getCmpPredicate( StringRef xla_comparison_direction) { return llvm::StringSwitch>(xla_comparison_direction) .Case("EQ", CmpIPredicate::eq) @@ -198,16 +206,16 @@ inline Value MapXlaCompareOpToStdScalarOp(XLACompareOpTy xla_op, const auto& lhs = args[0]; const auto& rhs = args[1]; Type element_type = lhs.getType(); - if (element_type.isa()) { + if (element_type.isSignlessInteger()) { Optional predicate = - getIntCmpPredicate(xla_op.comparison_direction()); + getCmpPredicate(xla_op.comparison_direction()); assert(predicate.hasValue() && "expected valid comparison direction"); return b->create>(xla_op.getLoc(), predicate.getValue(), lhs, rhs); } if (element_type.isa()) { Optional predicate = - getFloatCmpPredicate(xla_op.comparison_direction()); + getCmpPredicate(xla_op.comparison_direction()); assert(predicate.hasValue() && "expected valid comparison direction"); return b->create>(xla_op.getLoc(), predicate.getValue(), lhs, rhs); @@ -280,8 +288,8 @@ template <> inline Value MapXlaOpToStdScalarOp( xla_lhlo::ConvertOp xla_op, ArrayRef result_types, ArrayRef args, OpBuilder* b) { - const Type& sourceType = args.front().getType(); - const Type& targetType = result_types.front(); + Type sourceType = args.front().getType(); + Type targetType = result_types.front(); if (mlir::SIToFPOp::areCastCompatible(sourceType, targetType)) { return b->create(xla_op.getLoc(), result_types, args, @@ -299,7 +307,7 @@ inline Value MapXlaOpToStdScalarOp( // No conversion is needed for the same width floats return args.front(); } - if (sourceType.isa() && targetType.isa()) { + if (sourceType.isSignlessInteger() && targetType.isSignlessInteger()) { IntegerType src = sourceType.cast(); IntegerType res = targetType.cast(); if (src.getWidth() > res.getWidth()) { @@ -338,25 +346,58 @@ inline Value MapXlaOpToStdScalarOp(xla_hlo::CosOp xla_op, xla_op.getLoc(), result_types, args, b); } +/// Implements the conversion of XLA op to scalar op (to use within region of a +/// linalg.generic op) for compare-select style operations like min/max. +template +struct MapXlaCompareSelectOpToStdScalarOp { + Value operator()(Location loc, StringRef comparison_direction, + ArrayRef result_types, ArrayRef args, + OpBuilder* b) { + return nullptr; + } +}; + +/// Specialization which allows converting to a comparison operation in standard +/// dialect with a given predicate based on the element type of the operand. 
+template +struct MapXlaCompareSelectOpToStdScalarOp { + Value operator()(Location loc, StringRef comparison_direction, + ArrayRef result_types, ArrayRef args, + OpBuilder* b) { + Type element_type = args.front().getType(); + if (element_type.isa()) { + auto predicate = getCmpPredicate(comparison_direction); + assert(predicate.hasValue() && "expected valid comparison direction"); + auto cmp = b->template create(loc, predicate.getValue(), + args[0], args[1]); + return b->create<::mlir::SelectOp>(loc, cmp, args[0], args[1]); + } + return MapXlaCompareSelectOpToStdScalarOp{}( + loc, comparison_direction, result_types, args, b); + } +}; + template <> inline Value MapXlaOpToStdScalarOp(xla_lhlo::MaxOp xla_op, ArrayRef result_types, ArrayRef args, OpBuilder* b) { - const auto& lhs = args[0]; - const auto& rhs = args[1]; - Type element_type = lhs.getType(); - if (element_type.isa()) { - auto lhs_gt_rhs = b->create>( - xla_op.getLoc(), CmpIPredicate::sgt, lhs, rhs); - return b->create<::mlir::SelectOp>(xla_op.getLoc(), lhs_gt_rhs, lhs, rhs); - } - if (element_type.isa()) { - auto lhs_gt_rhs = b->create>( - xla_op.getLoc(), CmpFPredicate::OGT, lhs, rhs); - return b->create<::mlir::SelectOp>(xla_op.getLoc(), lhs_gt_rhs, lhs, rhs); - } - return nullptr; + return MapXlaCompareSelectOpToStdScalarOp< + IntegerType, ScalarIOp, CmpIPredicate, FloatType, + ScalarFOp, CmpFPredicate>{}(xla_op.getLoc(), "GT", + result_types, args, b); +} +template <> +inline Value MapXlaOpToStdScalarOp(xla_hlo::MaxOp xla_op, + ArrayRef result_types, + ArrayRef args, + OpBuilder* b) { + return MapXlaCompareSelectOpToStdScalarOp< + IntegerType, ScalarIOp, CmpIPredicate, FloatType, + ScalarFOp, CmpFPredicate>{}(xla_op.getLoc(), "GT", + result_types, args, b); } template <> @@ -364,20 +405,20 @@ inline Value MapXlaOpToStdScalarOp(xla_lhlo::MinOp xla_op, ArrayRef result_types, ArrayRef args, OpBuilder* b) { - const auto& lhs = args[0]; - const auto& rhs = args[1]; - Type element_type = lhs.getType(); - if (element_type.isa()) { - auto lhs_lt_rhs = b->create>( - xla_op.getLoc(), CmpIPredicate::slt, lhs, rhs); - return b->create<::mlir::SelectOp>(xla_op.getLoc(), lhs_lt_rhs, lhs, rhs); - } - if (element_type.isa()) { - auto lhs_lt_rhs = b->create>( - xla_op.getLoc(), CmpFPredicate::OLT, lhs, rhs); - return b->create<::mlir::SelectOp>(xla_op.getLoc(), lhs_lt_rhs, lhs, rhs); - } - return nullptr; + return MapXlaCompareSelectOpToStdScalarOp< + IntegerType, ScalarIOp, CmpIPredicate, FloatType, + ScalarFOp, CmpFPredicate>{}(xla_op.getLoc(), "LT", + result_types, args, b); +} +template <> +inline Value MapXlaOpToStdScalarOp(xla_hlo::MinOp xla_op, + ArrayRef result_types, + ArrayRef args, + OpBuilder* b) { + return MapXlaCompareSelectOpToStdScalarOp< + IntegerType, ScalarIOp, CmpIPredicate, FloatType, + ScalarFOp, CmpFPredicate>{}(xla_op.getLoc(), "LT", + result_types, args, b); } template <> diff --git a/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts.cc b/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts.cc index fbaab534565..4c20a589ce0 100644 --- a/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts.cc +++ b/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts.cc @@ -15,7 +15,7 @@ limitations under the License. 
#include -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/MLIRContext.h" // TF:llvm-project #include "mlir/IR/Operation.h" // TF:llvm-project #include "mlir/IR/PatternMatch.h" // TF:llvm-project @@ -164,7 +164,6 @@ std::vector ComputeBroadcastedShape(SrcOp op, Value small, Value large, loc, rewriter->getIntegerAttr(rewriter->getIndexType(), 1)); DimOp lrg_dim = rewriter->create(loc, large, i); DimOp sml_dim = rewriter->create(loc, small, indexes[i]); - sml_dim.dump(); CmpIOp compare = rewriter->create(loc, CmpIPredicate::eq, lrg_dim, one); index_value = diff --git a/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts_pass.cc b/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts_pass.cc index 596b67f0eed..644fffcc7ea 100644 --- a/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts_pass.cc +++ b/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts_pass.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/MLIRContext.h" // TF:llvm-project #include "mlir/IR/Operation.h" // TF:llvm-project #include "mlir/IR/PatternMatch.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/xla/transforms/rewriters.h b/tensorflow/compiler/mlir/xla/transforms/rewriters.h index 78ba93f4463..cb7f93edc25 100644 --- a/tensorflow/compiler/mlir/xla/transforms/rewriters.h +++ b/tensorflow/compiler/mlir/xla/transforms/rewriters.h @@ -41,6 +41,10 @@ void PopulateXlaToStdPatterns(OwningRewritePatternList *patterns, void populateHLOToLHLOConversionPattern(MLIRContext *context, OwningRewritePatternList *patterns); +// Collection of rewrite patterns for lowering of HLO to Linalg dialect. +void populateHLOToLinalgConversionPattern(MLIRContext *context, + OwningRewritePatternList *patterns); + // Sets up legality definitions for materializing broadcasts. void SetupMaterializeBroadcastsLegality(MLIRContext *context, ConversionTarget *conversionTarget); diff --git a/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc b/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc index d07819284e5..a0ab1ca2dbe 100644 --- a/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc +++ b/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc @@ -19,7 +19,7 @@ limitations under the License. #include "llvm/ADT/APInt.h" #include "mlir/Dialect/Linalg/IR/LinalgOps.h" // TF:llvm-project #include "mlir/Dialect/Linalg/IR/LinalgTypes.h" // TF:llvm-project -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/AffineExpr.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project @@ -33,17 +33,39 @@ limitations under the License. 
#include "mlir/Transforms/DialectConversion.h" // TF:llvm-project #include "tensorflow/compiler/mlir/xla/ir/lhlo_ops.h" #include "tensorflow/compiler/mlir/xla/transforms/map_xla_to_scalar_op.h" +#include "tensorflow/compiler/mlir/xla/transforms/rewriters.h" namespace mlir { namespace { -ArrayAttr GetNParallelLoopsAttrs(unsigned nParallelLoops, Builder b) { - auto parallelLoopTypeAttr = b.getStringAttr("parallel"); +ArrayAttr GetNParallelLoopsAttrs(unsigned nParallelLoops, Builder* b) { + auto parallelLoopTypeAttr = b->getStringAttr("parallel"); SmallVector iteratorTypes; for (int i = 0; i < nParallelLoops; ++i) { iteratorTypes.push_back(parallelLoopTypeAttr); } - return b.getArrayAttr(iteratorTypes); + return b->getArrayAttr(iteratorTypes); +} + +template +ShapedType getXLAOpResultType(Operation* op) { + if (isLHLO) { + return op->getOperand(op->getNumOperands() - 1) + .getType() + .cast(); + } + return op->getResult(0).getType().cast(); +} + +template +bool verifyXLAOpBufferOrTensorSemantics(Operation* op) { + auto verifyType = [&](Value val) -> bool { + return (isLHLO && val.getType().isa()) || + (!isLHLO && val.getType().isa()); + }; + if (!llvm::all_of(op->getOperands(), verifyType)) return false; + return isLHLO ? op->getResults().empty() + : llvm::all_of(op->getResults(), verifyType); } template @@ -61,7 +83,7 @@ class PointwiseToLinalgConverter : public OpConversionPattern { emitError(loc, "lhlo to linalg conversion expects ranked args"); return ConversionPattern::matchFailure(); } - if (!argType.getElementType().isIntOrFloat()) { + if (!argType.getElementType().isSignlessIntOrFloat()) { return ConversionPattern::matchFailure(); } @@ -110,7 +132,7 @@ class PointwiseToLinalgConverter : public OpConversionPattern { rewriter.getI64IntegerAttr(bodyArgTypes.size()), // args_in rewriter.getI64IntegerAttr(bodyResultTypes.size()), // args_out rewriter.getArrayAttr(indexingMaps), - GetNParallelLoopsAttrs(nloops, rewriter), + GetNParallelLoopsAttrs(nloops, &rewriter), /*doc=*/nullptr, /*fun=*/nullptr, /*library_call=*/nullptr); // Add a block to the region. @@ -149,7 +171,7 @@ class ScalarPointwiseToStandardConverter : public OpConversionPattern { auto loc = lhlo_op.getLoc(); auto argType = lhlo_op.getOperand(0).getType().template dyn_cast(); - if (!argType || !argType.getElementType().isIntOrFloat() || + if (!argType || !argType.getElementType().isSignlessIntOrFloat() || (argType.getRank() != 0)) { return ConversionPattern::matchFailure(); } @@ -167,108 +189,187 @@ class ScalarPointwiseToStandardConverter : public OpConversionPattern { } }; -class BroadcastInDimConverter - : public OpConversionPattern { +/// Base class for lowering xla operations that have one operand and one result, +/// and are semantically equivalent to a copy of the input to the output (like +/// transpose, some reshape, etc.). The derived classes need to provide a method +/// `getIndexingMapsAttr` that returns an ArrayAttr containing AffineMapAttr for +/// the index maps of the input and the output. 
+template +class DataMovementOpConverter : public OpConversionPattern { public: - using OpConversionPattern::OpConversionPattern; + using OpConversionPattern::OpConversionPattern; PatternMatchResult matchAndRewrite( - xla_lhlo::BroadcastInDimOp broadcastOp, ArrayRef args, + OpTy op, ArrayRef args, ConversionPatternRewriter& rewriter) const final { - auto operandMemrefType = - broadcastOp.operand().getType().dyn_cast(); - auto resultMemrefType = - broadcastOp.output().getType().dyn_cast(); - if (!operandMemrefType || !resultMemrefType) return matchFailure(); - auto broadcastDims = broadcastOp.broadcast_dimensions(); - if (!broadcastDims.hasValue()) return matchFailure(); + if (!verifyXLAOpBufferOrTensorSemantics(op)) + return ConversionPattern::matchFailure(); + auto operandType = op.operand().getType().template cast(); + auto resultType = getXLAOpResultType(op); + if (!verifyXLAOpBufferOrTensorSemantics(op)) + return ConversionPattern::matchFailure(); + ArrayAttr indexingMapsAttr = + static_cast(*this).getIndexingMapsAttr(op, &rewriter); + if (!indexingMapsAttr) return ConversionPattern::matchFailure(); - return broadcastDims.getValue().getIntValues().empty() - ? emitScalarBroadcast(broadcastOp, args, resultMemrefType, - &rewriter) - : emitNonScalarBroadcast(broadcastOp, args, operandMemrefType, - resultMemrefType, &rewriter); - } - - private: - PatternMatchResult emitScalarBroadcast( - xla_lhlo::BroadcastInDimOp broadcastOp, ArrayRef args, - MemRefType resultMemrefType, ConversionPatternRewriter* rewriter) const { - unsigned nloops = resultMemrefType.getRank(); - SmallVector indexingMaps{ - AffineMapAttr::get(rewriter->getMultiDimIdentityMap(nloops))}; - auto loc = broadcastOp.getLoc(); - auto linalgOp = rewriter->create( - loc, ArrayRef{}, broadcastOp.output(), - rewriter->getI64IntegerAttr(0), // args_in - rewriter->getI64IntegerAttr(1), // args_out - rewriter->getArrayAttr(indexingMaps), - GetNParallelLoopsAttrs(nloops, *rewriter), + OpBuilder::InsertionGuard linalgOpGuard(rewriter); + auto nloops = resultType.getRank(); + auto loc = op.getLoc(); + auto linalgOp = rewriter.create( + loc, isLHLO ? ArrayRef{} : resultType, args, + rewriter.getI64IntegerAttr(1), rewriter.getI64IntegerAttr(1), + indexingMapsAttr, GetNParallelLoopsAttrs(nloops, &rewriter), /*doc=*/nullptr, /*fun=*/nullptr, /*library_call=*/nullptr); - // Add a block to the region. 
auto* region = &linalgOp.region(); - auto* block = rewriter->createBlock(region, region->end()); - block->addArguments(resultMemrefType.getElementType()); + auto* block = rewriter.createBlock(region, region->end()); + block->addArguments(operandType.getElementType()); + if (isLHLO) block->addArgument(resultType.getElementType()); - rewriter->setInsertionPointToEnd(block); - auto scalar = - rewriter->create(loc, broadcastOp.operand(), llvm::None); - rewriter->create(loc, scalar.getResult()); - rewriter->eraseOp(broadcastOp); - return matchSuccess(); + rewriter.setInsertionPointToEnd(block); + rewriter.create(loc, block->getArgument(0)); + + rewriter.replaceOp(op, linalgOp.getOperation()->getResults()); + return ConversionPattern::matchSuccess(); } +}; - PatternMatchResult emitNonScalarBroadcast( - xla_lhlo::BroadcastInDimOp broadcastOp, ArrayRef args, - MemRefType operandMemrefType, MemRefType resultMemrefType, - ConversionPatternRewriter* rewriter) const { - SmallVector bodyArgTypes{operandMemrefType.getElementType()}; +template +class BroadcastInDimConverter + : public DataMovementOpConverter, + OpTy, isLHLO> { + public: + using DataMovementOpConverter, OpTy, + isLHLO>::DataMovementOpConverter; - unsigned nloops = resultMemrefType.getRank(); + ArrayAttr getIndexingMapsAttr(OpTy broadcastOp, Builder* b) const { + auto resultType = getXLAOpResultType(broadcastOp); + auto operandType = + broadcastOp.operand().getType().template cast(); + unsigned nloops = resultType.getRank(); - auto operandShape = operandMemrefType.getShape(); + auto operandShape = operandType.getShape(); SmallVector dimExprs; { dimExprs.reserve(nloops); - for (const auto& broadcastDim : llvm::enumerate( - broadcastOp.broadcast_dimensions().getValue().getIntValues())) { - int dim = broadcastDim.value().getSExtValue(); - // TODO(pifon): Add support for args with dynamic shapes for the case - // when a dimension of size 1 is broadcasted into dim of size N. - AffineExpr affineExpr = - operandShape[broadcastDim.index()] == 1 - ? mlir::getAffineConstantExpr(0, broadcastOp.getContext()) - : mlir::getAffineDimExpr(dim, broadcastOp.getContext()); - dimExprs.push_back(affineExpr); + if (broadcastOp.broadcast_dimensions()) { + for (const auto& broadcastDim : + enumerate(broadcastOp.broadcast_dimensions() + .getValue() + .getIntValues())) { + int size = broadcastDim.value().getSExtValue(); + // TODO(pifon): Add support for args with dynamic shapes for the case + // when a dimension of size 1 is broadcasted into dim of size N. + AffineExpr affineExpr = operandShape[broadcastDim.index()] == 1 + ? b->getAffineConstantExpr(0) + : b->getAffineDimExpr(size); + dimExprs.push_back(affineExpr); + } + } + if (dimExprs.empty()) { + // The input is a scalar, i.e. this is a scalar broadcast op. + dimExprs.push_back(b->getAffineConstantExpr(0)); } } + return b->getAffineMapArrayAttr( + {AffineMap::get(nloops, /*symbolCount=*/0, dimExprs), + b->getMultiDimIdentityMap(nloops)}); + } +}; - // Construct the indexing maps needed for linalg.generic ops. 
- SmallVector indexingMaps{ - AffineMapAttr::get(AffineMap::get(nloops, /*symbolCount=*/0, dimExprs)), - AffineMapAttr::get(rewriter->getMultiDimIdentityMap(nloops))}; +template +class TransposeConverter + : public DataMovementOpConverter, OpTy, + isLHLO> { + public: + using DataMovementOpConverter, OpTy, + isLHLO>::DataMovementOpConverter; + ArrayAttr getIndexingMapsAttr(OpTy op, Builder* b) const { + auto resultType = + getXLAOpResultType(op).template cast(); + auto nloops = resultType.getRank(); + SmallVector inputExprs; + inputExprs.resize(resultType.getRank()); + for (auto permutation : llvm::enumerate(op.permutation())) { + inputExprs[permutation.value().getZExtValue()] = + b->getAffineDimExpr(permutation.index()); + } + return b->getAffineMapArrayAttr( + {AffineMap::get(nloops, /*symbolCount=*/0, inputExprs), + b->getMultiDimIdentityMap(nloops)}); + } +}; - auto loc = broadcastOp.getLoc(); - auto linalgOp = rewriter->create( - loc, ArrayRef{}, args, - rewriter->getI64IntegerAttr(bodyArgTypes.size()), // args_in - rewriter->getI64IntegerAttr(1), // args_out - rewriter->getArrayAttr(indexingMaps), - GetNParallelLoopsAttrs(nloops, *rewriter), - /*doc=*/nullptr, /*fun=*/nullptr, /*library_call=*/nullptr); +/// Pattern for the special case where reshape is adding or removing a dimension +/// of size 1. These can be lowered to a linalg.generic op. +/// +/// For example a +/// "xla_hlo.reshape"(..) : (tensor<12x1x42xi32) -> tensor<12x42xi32> +/// can have indexing maps +/// [affine_map<(d0, d1) -> (d0, 0, d1)>, affine_map<(d0, d1) -> (d0, d1)>] +/// +/// Similarly a +/// "xla_hlo.reshape"(..) : (tensor<12x42xi32>) -> tensor<12x1x42xi32> +/// can have indexing maps +/// [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, +/// d2)>] +template +class ReshapeAddRemoveDimConverter + : public DataMovementOpConverter, + OpTy, isLHLO> { + public: + using DataMovementOpConverter, + OpTy, isLHLO>::DataMovementOpConverter; - // Add a block to the region. - auto* region = &linalgOp.region(); - auto* block = rewriter->createBlock(region, region->end()); - block->addArguments(bodyArgTypes); - block->addArguments(resultMemrefType.getElementType()); + ArrayAttr getIndexingMapsAttr(OpTy op, Builder* b) const { + auto resultType = + getXLAOpResultType(op).template cast(); + auto operandType = + op.getOperation()->getOperand(0).getType().template cast(); + if (!resultType.hasStaticShape() || !operandType.hasStaticShape()) + return nullptr; - rewriter->setInsertionPointToEnd(block); - rewriter->create(loc, block->getArgument(0)); - rewriter->eraseOp(broadcastOp); - return matchSuccess(); + auto nloops = resultType.getRank(); + SmallVector inputExprs; + unsigned resultIndex = 0, operandIndex = 0; + auto resultShape = resultType.getShape(); + auto operandShape = operandType.getShape(); + + while (resultIndex < resultShape.size() && + operandIndex < operandShape.size()) { + if (resultShape[resultIndex] == operandShape[operandIndex]) { + // Copy over the affine expr when the size of the result and operand + // match at a dim + inputExprs.push_back(b->getAffineDimExpr(resultIndex)); + resultIndex++; + operandIndex++; + } else if (resultShape[resultIndex] == 1) { + // If size at result is 1, then ignore this dimension for the input, it + // is an extra dim added. + resultIndex++; + } else if (operandShape[operandIndex] == 1) { + // If the operandShape is 1, then add a (0) for the operand map since + // this dimension is dropped. 
+ inputExprs.push_back(b->getAffineConstantExpr(0)); + operandIndex++; + } else { + return nullptr; + } + } + // Make sure all remaining dimensions of the operand and result are ones. + auto checkRemainingDims = [](int64_t dim) { return dim != 1; }; + if ((resultIndex < resultShape.size() && + llvm::any_of(resultShape.drop_front(resultIndex), + checkRemainingDims)) || + (operandIndex < operandShape.size() && + llvm::any_of(operandShape.drop_front(operandIndex), + checkRemainingDims))) + return nullptr; + inputExprs.resize(operandShape.size(), b->getAffineConstantExpr(0)); + return b->getAffineMapArrayAttr( + {AffineMap::get(nloops, /*symbolCount=*/0, inputExprs), + b->getMultiDimIdentityMap(nloops)}); } }; @@ -284,7 +385,7 @@ class IotaConverter : public OpConversionPattern { if (!resultMemrefType) return matchFailure(); auto resultElementType = resultMemrefType.getElementType(); - if (!resultElementType.isIntOrFloat()) return matchFailure(); + if (!resultElementType.isSignlessIntOrFloat()) return matchFailure(); // Construct the indexing maps needed for linalg.generic ops. unsigned nloops = resultMemrefType.getRank(); @@ -298,7 +399,7 @@ class IotaConverter : public OpConversionPattern { rewriter.getI64IntegerAttr(0), // args_in rewriter.getI64IntegerAttr(1), // args_out rewriter.getArrayAttr(indexingMaps), - GetNParallelLoopsAttrs(nloops, rewriter), + GetNParallelLoopsAttrs(nloops, &rewriter), /*doc=*/nullptr, /*fun=*/nullptr, /*library_call=*/nullptr); // Add a block to the region. @@ -378,7 +479,7 @@ class SliceConverter : public OpConversionPattern { void populateLHLOToLinalgConversionPattern(MLIRContext* context, OwningRewritePatternList* patterns) { // clang-format off - patterns->insertinsert, ConstConverter, IotaConverter, PointwiseToLinalgConverter, @@ -400,29 +501,13 @@ void populateLHLOToLinalgConversionPattern(MLIRContext* context, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + ReshapeAddRemoveDimConverter, ScalarPointwiseToStandardConverter, SliceConverter >(context); // clang-format on } -void populateHLOToLinalgConversionPattern(MLIRContext* context, - OwningRewritePatternList* patterns) { - patterns->insert, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter, - PointwiseToLinalgConverter>(context); -} - // Converts LHLO ops to Linalg generic. // Sample result for xla_lhlo::AddOp. 
// @@ -464,7 +549,7 @@ struct HloLegalizeToLinalg : public FunctionPass { target.addLegalDialect(); auto func = getFunction(); - populateHLOToLinalgConversionPattern(func.getContext(), &patterns); + xla_hlo::populateHLOToLinalgConversionPattern(func.getContext(), &patterns); if (failed(applyPartialConversion(func, target, patterns, nullptr))) { signalPassFailure(); } @@ -483,6 +568,30 @@ static PassRegistration legalize_lhlo_pass( } // namespace xla_lhlo namespace xla_hlo { + +void populateHLOToLinalgConversionPattern(MLIRContext* context, + OwningRewritePatternList* patterns) { + patterns->insert, + ReshapeAddRemoveDimConverter, + TransposeConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter, + PointwiseToLinalgConverter>(context); +} + std::unique_ptr> createLegalizeHloToLinalgPass() { return absl::make_unique(); } diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index 203ef51c842..d517b5d0bdd 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -991,6 +991,11 @@ tf_xla_py_test( name = "unstack_test", size = "medium", srcs = ["unstack_test.py"], + disabled_backends = [ + # TODO(b/149750262): timeout on CPU. + "cpu", + "cpu_ondemand", + ], python_version = "PY3", shard_count = 5, tags = [ diff --git a/tensorflow/compiler/tests/eager_test.py b/tensorflow/compiler/tests/eager_test.py index a03980f20ba..0ed81b7e9e5 100644 --- a/tensorflow/compiler/tests/eager_test.py +++ b/tensorflow/compiler/tests/eager_test.py @@ -695,7 +695,7 @@ class EagerFunctionTest(xla_test.XLATestCase): wholly_compiled_f = def_function.function(f) op_by_op_f = def_function.function(f, experimental_compile=False) - x = constant_op.constant([0.0, 2.0], name='data') + x = array_ops.identity([0.0, 2.0], name='data') # When function is wholly compiled, all outputs will be on the # device on which it is run. 
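Looking back at the MapXlaCompareSelectOpToStdScalarOp helper introduced in map_xla_to_scalar_op.h above: it peels one (supported type, compare op, predicate) triple off its template pack per specialization and recurses on the rest, with the empty pack as the no-match base case. Below is a minimal standalone C++ sketch of that dispatch shape; the types in it are invented stand-ins, not MLIR's, and the runtime string comparison plays the role of the isa<> test.

// Standalone illustration (not part of the patch).
#include <iostream>
#include <string>

struct IntegerTag { static constexpr const char* kName = "integer"; };
struct FloatTag   { static constexpr const char* kName = "float"; };
struct EmitCmpI   { static std::string emit() { return "cmpi sgt + select"; } };
struct EmitCmpF   { static std::string emit() { return "cmpf ogt + select"; } };

// Base case: the pack of (Tag, Emitter) pairs is exhausted, so no lowering.
template <typename... Args>
struct CompareSelect {
  std::string operator()(const std::string&) const { return "<no match>"; }
};

// Recursive case: peel one (Tag, Emitter) pair; on a runtime tag match emit
// the ops, otherwise retry with the remaining pairs in the pack.
template <typename Tag, typename Emitter, typename... Rest>
struct CompareSelect<Tag, Emitter, Rest...> {
  std::string operator()(const std::string& element_type) const {
    if (element_type == Tag::kName) return Emitter::emit();
    return CompareSelect<Rest...>{}(element_type);
  }
};

int main() {
  CompareSelect<IntegerTag, EmitCmpI, FloatTag, EmitCmpF> lower_max;
  std::cout << lower_max("float") << "\n";    // cmpf ogt + select
  std::cout << lower_max("complex") << "\n";  // <no match>
}

The design lets the MaxOp and MinOp conversions share one implementation and differ only in the predicate baked into the pack, which is why the patch can delete the duplicated integer and float branches from both.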
diff --git a/tensorflow/compiler/tf2tensorrt/BUILD b/tensorflow/compiler/tf2tensorrt/BUILD index b815b6d1b1f..b26b509b067 100644 --- a/tensorflow/compiler/tf2tensorrt/BUILD +++ b/tensorflow/compiler/tf2tensorrt/BUILD @@ -189,6 +189,8 @@ tf_cuda_cc_test( "//tensorflow/core:test_main", "//tensorflow/core:testlib", "//tensorflow/core/kernels:ops_testutil", + "//tensorflow/core/kernels:function_ops", + "//tensorflow/core/kernels:array", ] + if_tensorrt([ "@local_config_cuda//cuda:cuda_headers", ]), diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index 063a6667ed1..6f276546451 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -469,6 +469,7 @@ Status CreateTRTNode(const ConversionParams& params, .Attr("precision_mode", prec_string) .Attr("use_calibration", info.use_calibration) .Attr("_use_implicit_batch", params.use_implicit_batch) + .Attr("_allow_build_at_runtime", info.allow_build_at_runtime) .Attr("OutT", out_types) .Finalize(&trt_node); if (!status.ok()) { @@ -672,6 +673,7 @@ Status ConvertAfterShapes(const ConversionParams& params) { : EngineInfo::EngineType::TRTStatic); curr_engine.use_calibration = params.use_calibration; curr_engine.maximum_cached_engines = params.max_cached_engines; + curr_engine.allow_build_at_runtime = params.allow_build_at_runtime; status = RegisterGraphToFunctionLibrary(curr_engine.segment_graph_def, &graph, curr_engine.engine_name); diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h index 00dc4c72f43..2bfaa2a786c 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h @@ -49,6 +49,7 @@ struct ConversionParams { int max_cached_engines = 1; bool use_calibration = true; bool use_implicit_batch = true; + bool allow_build_at_runtime = true; }; // Method to call from optimization pass diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index 433564513db..8083a55466a 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -54,6 +54,7 @@ limitations under the License. #include "tensorflow/core/platform/tensor_coding.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/public/version.h" +#include "tensorflow/core/util/env_var.h" #include "tensorflow/core/util/strided_slice_op.h" #if GOOGLE_CUDA @@ -2225,6 +2226,21 @@ Status ConvertConv2DHelper(OpConverterParams* params, int group, return Status::OK(); } +bool AllowInefficientTranspose() { + static bool result = [] { + bool value; + Status status = + ReadBoolFromEnvVar("TF_DEBUG_TRT_ALLOW_INEFFICIENT_TRANSPOSE", + /*default_value=*/false, &value); + if (!status.ok()) { + LOG(ERROR) << status; + } + return value; + }(); + + return result; +} + Status ConvertTranspose(OpConverterParams* params) { const auto& inputs = params->inputs; TF_RETURN_IF_ERROR( @@ -2251,7 +2267,7 @@ Status ConvertTranspose(OpConverterParams* params) { // So check tensor size, and don't convert if it is too large. 
constexpr int64_t kMaxEfficientTranspose = 2500000; int64_t tensor_size = TrtTensorDimsNumElements(input_tensor->getDimensions()); - if (tensor_size > kMaxEfficientTranspose) { + if (!AllowInefficientTranspose() && tensor_size > kMaxEfficientTranspose) { return errors::Unimplemented(StrCat("Transpose too large:", tensor_size)); } diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h index 3f65b1a9818..8608c8226ee 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h @@ -93,7 +93,8 @@ struct EngineInfo { : engine_type(EngineType::TRTStatic), max_workspace_size_bytes(0), precision_mode(TrtPrecisionMode::FP32), - use_calibration(true) {} + use_calibration(true), + allow_build_at_runtime(true) {} string engine_name; string device; @@ -110,6 +111,7 @@ struct EngineInfo { int maximum_cached_engines; TrtPrecisionMode precision_mode; bool use_calibration; + bool allow_build_at_runtime; }; // Constructs a graphdef from the segment in the given graph. Adds _Arg diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc index 757ddd159c9..7995163ed44 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc @@ -70,6 +70,9 @@ Status TRTOptimizationPass::Init( if (params.count("trt_logger")) { trt_logger_name_ = params.at("trt_logger").s(); } + if (params.count("allow_build_at_runtime")) { + allow_build_at_runtime_ = params.at("allow_build_at_runtime").b(); + } if (params.count("use_implicit_batch")) { use_implicit_batch_ = params.at("use_implicit_batch").b(); } @@ -265,6 +268,7 @@ Status TRTOptimizationPass::Optimize(grappler::Cluster* cluster, cp.max_cached_engines = max_cached_batches_; cp.use_calibration = use_calibration_; cp.use_implicit_batch = use_implicit_batch_; + cp.allow_build_at_runtime = allow_build_at_runtime_; auto status = ConvertAfterShapes(cp); VLOG(1) << "Returning from " << name_; return status; diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h index 3ce0d09b7c0..f79048bb5f6 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h +++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h @@ -42,7 +42,8 @@ class TRTOptimizationPass : public grappler::CustomGraphOptimizer { max_cached_batches_(1), max_workspace_size_bytes_(256LL << 20), use_calibration_(true), - use_implicit_batch_(true) { + use_implicit_batch_(true), + allow_build_at_runtime_(true) { VLOG(1) << "Constructing " << name_; } @@ -75,6 +76,7 @@ class TRTOptimizationPass : public grappler::CustomGraphOptimizer { int64_t max_workspace_size_bytes_; bool use_calibration_; bool use_implicit_batch_; + bool allow_build_at_runtime_; }; } // namespace convert diff --git a/tensorflow/compiler/tf2tensorrt/convert/utils.cc b/tensorflow/compiler/tf2tensorrt/convert/utils.cc index 2fb8902883e..c83b84998fa 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/utils.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/utils.cc @@ -134,6 +134,35 @@ string DebugString(const std::vector& shapes) { return PartialTensorShapeUtils::PartialShapeListString(shapes); } +// Checks whether actual_shapes are compatible with cached_shapes. 
This should +// only be used in implicit batch mode (in explicit batch mode one needs to +// check the profile ranges). Therefore implicit batch mode is assumed. +// It is also assumed that both actual_shapes and cached_shapes have been +// verified by TRTEngineOp::VerifyInputShapes, which ensures that the batch size +// for all tensors is the same. +bool AreShapesCompatible(const std::vector<TensorShape>& actual_shapes, + const std::vector<TensorShape>& cached_shapes) { + auto match_shape = [](const TensorShape& actual_shape, + const TensorShape& cached_shape) { + // Match the rank. + if (actual_shape.dims() != cached_shape.dims()) return false; + // Match the batch size. In implicit batch mode cached_shape.dim_size(0) is + // the max batch size, which can be larger than the actual batch size. + if (actual_shape.dim_size(0) > cached_shape.dim_size(0)) return false; + // Match remaining dimensions. + for (int i = 1; i < actual_shape.dims(); ++i) { + if (actual_shape.dim_size(i) != cached_shape.dim_size(i)) return false; + } + return true; + }; + for (int i = 0; i < actual_shapes.size(); ++i) { + if (!match_shape(actual_shapes[i], cached_shapes[i])) { + return false; + } + } + return true; +} + int GetNumberOfEngineInputs(const nvinfer1::ICudaEngine* engine) { int n_bindings = engine->getNbBindings(); int n_input = 0; @@ -152,6 +181,7 @@ int GetNumberOfEngineInputs(const nvinfer1::ICudaEngine* engine) { #endif return n_input / n_profiles; } + #endif string GetLinkedTensorRTVersion() { diff --git a/tensorflow/compiler/tf2tensorrt/convert/utils.h b/tensorflow/compiler/tf2tensorrt/convert/utils.h index 668620bb90a..139984616f0 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/utils.h +++ b/tensorflow/compiler/tf2tensorrt/convert/utils.h @@ -98,14 +98,18 @@ inline nvinfer1::Dims TensorShapeToTrtDims(const TensorShapeType& shape, return trt_dims; } -// Return a string that includes compile time -// TensorRT library version information {Maj, Min, Patch}. +// Returns a string that includes compile time TensorRT library version +// information {Maj, Min, Patch}. string GetLinkedTensorRTVersion(); -// Return a string that includes runtime time -// TensorRT library version information {Maj, Min, Patch}. +// Returns a string that includes runtime TensorRT library version +// information {Maj, Min, Patch}. string GetLoadedTensorRTVersion(); +// Returns true if an engine built for cached_shapes can also run actual_shapes. +bool AreShapesCompatible(const std::vector<TensorShape>& actual_shapes, + const std::vector<TensorShape>& cached_shapes); + // Returns the number of inputs for the engine, which also corresponds to the // number of input tensors for the network. This can differ from the number of // input bindings, because the number of total input bindings equals the number diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc index cdb1207ba6c..ad6eed3fb50 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc @@ -119,21 +119,15 @@ class TRTEngineOp : public AsyncOpKernel { Status GetEngineCacheResource(OpKernelContext* ctx, TRTEngineCacheResource** cache_res); - // Get engine for the input shape - StatusOr<EngineContext*> GetEngine( - const std::vector<TensorShape>& input_shapes, OpKernelContext* ctx, - TRTEngineCacheResource* cache_res); + // Returns a pair of 1) an EngineContext object that is compatible with the + // input and 2) the index of the IExecutionContext compatible with the input.
+ StatusOr> GetEngine( + const std::vector& input_concrete_shapes, + OpKernelContext* ctx, TRTEngineCacheResource* cache_res); // Verify that the input shapes are consistent and can be handled by this op. Status VerifyInputShapes(const std::vector& shapes); - // Return engine batch in cached_engine_batch_sizes_ which is closest to input - // batch. - Status GetEngineInputShapes( - const CacheType& cache, - const std::vector& actual_input_shapes, - std::vector* engine_input_shapes); - std::vector input_nodes_; std::vector output_nodes_; @@ -156,10 +150,13 @@ class TRTEngineOp : public AsyncOpKernel { // Whether to calibrate INT8 engine. bool calibration_mode_; - // Whether to use implicit batch dimension for TensorRT + // Whether to use implicit batch dimension for TensorRT. bool use_implicit_batch_; - // Maximum number of cached engines + // Whether to build TensorRT engines at runtime. + bool allow_build_at_runtime_; + + // Maximum number of cached engines. int max_cached_engines_; int64 workspace_size_; @@ -283,6 +280,14 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) context->GetAttr("use_calibration", &use_calibration_)); OP_REQUIRES_OK(context, context->GetAttr("input_shapes", &input_partial_shapes_)); + auto status = + context->GetAttr("_allow_build_at_runtime", &allow_build_at_runtime_); + if (status.code() == tensorflow::error::NOT_FOUND) { + VLOG(2) << "Not found _allow_build_at_runtime in " + << context->device()->name() + << ", thus setting _allow_build_at_runtime=true"; + allow_build_at_runtime_ = true; + } func_handle_ = kInvalidHandle; if (!static_engine_) { FunctionLibraryRuntime* lib = context->function_library(); @@ -304,7 +309,7 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) OP_REQUIRES_OK(context, context->GetAttr("max_cached_engines_count", &max_cached_engines_)); - auto status = context->GetAttr("_use_implicit_batch", &use_implicit_batch_); + status = context->GetAttr("_use_implicit_batch", &use_implicit_batch_); if (status.code() == tensorflow::error::NOT_FOUND) { VLOG(2) << "Not found _use_implicit_batch in " << context->device()->name() << ", thus setting _use_implicit_batch=true"; @@ -490,61 +495,6 @@ Status TRTEngineOp::VerifyInputShapes( return Status::OK(); } -bool AreShapesCompatible(const std::vector& actual_shapes, - const std::vector& cached_shapes) { - auto match_shape = [](const TensorShape& actual_shape, - const TensorShape& cached_shape) { - // Match the rank. - if (actual_shape.dims() != cached_shape.dims()) return false; - // Match the batch size. - if (actual_shape.dim_size(0) > cached_shape.dim_size(0)) return false; - // Match remaining dimensions. - for (int i = 1; i < actual_shape.dims(); ++i) { - if (actual_shape.dim_size(i) != cached_shape.dim_size(i)) return false; - } - return true; - }; - for (int i = 0; i < actual_shapes.size(); ++i) { - if (!match_shape(actual_shapes[i], cached_shapes[i])) { - return false; - } - } - return true; -} - -// This routine finds the engines with input shapes compatible with the -// actual_input_shapes, and returns the input shapes of one of such engine that -// has the smallest batch size. -Status TRTEngineOp::GetEngineInputShapes( - const CacheType& cache, const std::vector& actual_input_shapes, - std::vector* engine_input_shapes) { - // VerifyInputShapes() already ensured that all input shapes have same - // batch size, and are not scalars, if we are in implicit batch mode. 
- // - // In explicit batch mode we plan to have single engine in the cache, and we - // return its shape if it is compatible. - *engine_input_shapes = actual_input_shapes; - int64 min_matched_batch_size = kint64max; - for (const auto& pair : cache) { - const std::vector& cached_input_shapes = pair.first; - // This should not happen, but just for safety. - if (actual_input_shapes.size() != cached_input_shapes.size()) { - return errors::InvalidArgument( - "Input shape list size mismatch for ", name(), - ", cached size: ", cached_input_shapes.size(), - " vs. actual size: ", actual_input_shapes.size()); - } - if (AreShapesCompatible(actual_input_shapes, cached_input_shapes)) { - const int cached_batch_size = cached_input_shapes[0].dim_size(0); - if (min_matched_batch_size > cached_batch_size) { - min_matched_batch_size = cached_batch_size; - *engine_input_shapes = cached_input_shapes; - } - } - } - return Status::OK(); -} - void TRTEngineOp::ComputeAsync(OpKernelContext* ctx, AsyncOpKernel::DoneCallback done) { auto helper = new AsyncHelper(done); @@ -605,15 +555,12 @@ void TRTEngineOp::ComputeAsync(OpKernelContext* ctx, cache_res->profiles_.InitProfiles(); } } - StatusOr status = + StatusOr> status = GetEngine(input_concrete_shapes, ctx, cache_res); OP_REQUIRES_OK_ASYNC(ctx, status.status(), *helper); - EngineContext* engine_context = status.ValueOrDie(); - // Context idx equals with the profile idx because for each profile we create - // one context. Currently we do not have profile_generation mode, therefore we - // have just a single profile. - int trt_context_idx = 0; + EngineContext* engine_context = status.ValueOrDie().first; + int trt_context_idx = status.ValueOrDie().second; if (!engine_context->cuda_engine) { VLOG(1) << "Engine retrieval for input shapes: " << TensorShapeUtils::ShapeListString(input_concrete_shapes) @@ -693,17 +640,20 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx, VLOG(2) << binding_types; } - const bool kRetry = true; - if (trt_context_idx >= 1) { - LOG(ERROR) << "Requested engine context with index " << trt_context_idx - << ", but only 1 context is present."; - return kRetry; - } const int num_binding = cuda_engine->getNbBindings(); std::vector buffers(num_binding); mutex_lock lock(engine_context->mu); - auto& execution_context = engine_context->execution_context; + nvinfer1::IExecutionContext* execution_context; + Status status = + engine_context->GetExecutionContext(trt_context_idx, &execution_context); + const bool kRetry = true; + if (!status.ok()) { + // TODO(Tamas) let ExecuteTrtEngine return a status, and do the logging at + // the call site + LOG(ERROR) << status; + return kRetry; + } // Setup engine inputs. for (int i = 0; i < ctx->num_inputs(); i++) { @@ -905,7 +855,7 @@ Status TRTEngineOp::GetEngineCacheResource(OpKernelContext* ctx, }}); } -StatusOr TRTEngineOp::GetEngine( +StatusOr> TRTEngineOp::GetEngine( const std::vector& input_concrete_shapes, OpKernelContext* ctx, TRTEngineCacheResource* cache_res) { static EngineContext empty_context; @@ -920,7 +870,7 @@ StatusOr TRTEngineOp::GetEngine( auto& cache = cache_res->cache_; auto allocator = cache_res->allocator_.get(); if (allocator == nullptr) { - return &empty_context; + return std::pair(&empty_context, 0); } // Handle the static engine case. For static engines, the cache will have a @@ -931,9 +881,9 @@ StatusOr TRTEngineOp::GetEngine( // implicit batch is disabled. 
if (!use_implicit_batch_ || AreShapesCompatible(input_concrete_shapes, cache.begin()->first)) { - return cache.begin()->second.get(); + return std::pair(cache.begin()->second.get(), 0); } - return &empty_context; + return std::pair(&empty_context, 0); } TrtUniquePtrType infer(nvinfer1::createInferRuntime(logger)); @@ -942,7 +892,7 @@ StatusOr TRTEngineOp::GetEngine( infer->deserializeCudaEngine(serialized_segment_.c_str(), serialized_segment_.size(), nullptr)); if (!static_engine) { - return &empty_context; + return std::pair(&empty_context, 0); } auto raw_static_engine = static_engine.get(); const auto max_batch_size = raw_static_engine->getMaxBatchSize(); @@ -966,19 +916,41 @@ StatusOr TRTEngineOp::GetEngine( // Swap with temporary empty string to deallocate the CPU memory. serialized_segment_.swap(tmp); if (use_implicit_batch_ && (max_batch_size < batch_size)) { - return &empty_context; + return std::pair(&empty_context, 0); } - return cache.at(engine_input_shapes).get(); + return std::pair(cache.at(engine_input_shapes).get(), + 0); } // static_engine_ - // Handle the dynamic engine case. See if there is a compatible engine cached. - std::vector engine_input_shapes; - TF_RETURN_IF_ERROR( - GetEngineInputShapes(cache, input_concrete_shapes, &engine_input_shapes)); + int profile_id = -1; + if (!use_implicit_batch_) { + profile_id = cache_res->profiles_.GetProfileNumber(input_concrete_shapes); + // Since all profiles are already created at this point, finding no + // compatible profiles results in falling back to native TF. + if (profile_id == -1) { + return std::pair(&empty_context, 0); + } + } - // If matched, use that engine. Otherwise, we will look in cache for that - // exact shape and possibly create a new engine if it is not in cache. - if (!cache.count(engine_input_shapes)) { + EngineContext* engine_contexts; + if (use_implicit_batch_) { + engine_contexts = cache_res->GetEngineContext(input_concrete_shapes); + } else { + engine_contexts = cache_res->GetEngineContext(profile_id); + } + + // If cache does not have a compatible engine then create a new engine. + if (engine_contexts == nullptr) { + if (!allow_build_at_runtime_) { + LOG(WARNING) << "Found no engine in cache matching input shapes. " + << "Not building a new engine because " + << "allow_build_at_runtime=False. " + << "The native segment will be used instead."; + // Store an empty engine in the cache for these input shapes so we don't + // try to build the same failing engine again. + cache.emplace(input_concrete_shapes, absl::make_unique()); + return std::pair(&empty_context, 0); + } TrtUniquePtrType engine; bool convert_successfully = false; LOG(INFO) << "Building a new TensorRT engine for " << name() @@ -1007,18 +979,20 @@ StatusOr TRTEngineOp::GetEngine( // Store an empty engine in the cache for these input shapes so we don't // try to build the same failing engine again. cache.emplace(input_concrete_shapes, absl::make_unique()); - return &empty_context; + return std::pair(&empty_context, 0); } std::vector> exec_context; TF_RETURN_IF_ERROR(cache_res->profiles_.CreateExecutionContexts( engine.get(), exec_context)); cache.emplace(input_concrete_shapes, absl::make_unique(std::move(engine), - std::move(exec_context[0]))); + std::move(exec_context))); VLOG(1) << "Added new engine to cache of " << name() << ". Cache size: " << cache.size(); + engine_contexts = cache.at(input_concrete_shapes).get(); } - return cache.at(engine_input_shapes).get(); + return std::pair(engine_contexts, + use_implicit_batch_ ? 
0 : profile_id); } // TODO(hinsu): Move this allocation to CalibrationContext constructor, if diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc index 51574dad6b5..da8bd6686a7 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc @@ -62,7 +62,8 @@ class TRTEngineOpTestBase : public OpsTestBase { public: void AddSimpleTrtOp(DataType dtype, int max_cached_engines_count = 1, PartialTensorShape shape = PartialTensorShape({-1, -1}), - bool use_implicit_batch = true) { + bool use_implicit_batch = true, + bool allow_build_at_runtime = true) { // Create the GPU device. std::unique_ptr device( DeviceFactory::NewDevice("GPU", {}, "/job:worker/replica:0/task:0")); @@ -104,6 +105,7 @@ class TRTEngineOpTestBase : public OpsTestBase { .Attr("precision_mode", "FP32") .Attr("use_calibration", false) .Attr("_use_implicit_batch", use_implicit_batch) + .Attr("_allow_build_at_runtime", allow_build_at_runtime) .Attr("OutT", {dtype}) .Finalize(OpsTestBase::node_def())); TF_ASSERT_OK(InitOpWithFunctionLibrary()); @@ -191,6 +193,32 @@ TEST_F(TRTEngineOpTestBase, DynamicEngines) { EXPECT_EQ(1, cache->count({TensorShape({10, 10})})); } +TEST_F(TRTEngineOpTestBase, AllowBuildAtRuntime) { + TRTEngineOpTestBase::AddSimpleTrtOp(DT_FLOAT, /*max_cached_engines_count=*/1, + PartialTensorShape({-1, -1}), + /*use_implicit_batch=*/true, + /*allow_build_at_runtime=*/false); + + // Execute the op + TensorShape input_shape({2, 2}); + TRTEngineOpTestBase::AddSimpleInput(input_shape); + TF_ASSERT_OK(OpsTestBase::RunOpKernel()); + + // Get the engine cache. + TRTEngineCacheResource* cache_resource = nullptr; + TF_ASSERT_OK( + device_->resource_manager()->Lookup("TF-TRT", "myop", &cache_resource)); + core::ScopedUnref sc(cache_resource); + + // It should contain a placeholder with an empty cuda_engine (to mark that + // engine creation was not successful for the given input shape). + auto cache = &cache_resource->cache_; + EXPECT_EQ(1, cache->size()); + ASSERT_EQ(1, cache->count({input_shape})); + EngineContext* ectx = cache->at({input_shape}).get(); + EXPECT_EQ(ectx->cuda_engine, nullptr); +} + #if IS_TRT_VERSION_GE(6, 0, 0, 0) TEST_F(TRTEngineOpTestBase, ExplicitBatch) { // Test inference in explicit batch mode with static input shapes. 
Static diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc index de7b7381d0c..8ef72ba44d5 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc @@ -157,7 +157,7 @@ class InitializeTRTResource : public OpKernel { } resource->cache_.emplace(engine_input_shapes, absl::make_unique( - std::move(engine), std::move(ctx_vec[0]))); + std::move(engine), std::move(ctx_vec))); ++num_loaded_engine; } while (1); VLOG(1) << "Loaded " << num_loaded_engine << " TRT engines for op " diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc index 5ab6bf1a317..fbcdaad52c0 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc @@ -88,12 +88,56 @@ string TRTEngineCacheResource::DebugString() const { mutex_lock lock(item.second->mu); oss << TensorShapeUtils::ShapeListString(item.first) << ": " << hex << "ICudaEngine: " << item.second->cuda_engine.get() << ", " - << "IExecutionContext: " << item.second->execution_context.get() << dec - << endl; + << "IExecutionContext: "; + for (auto& ctx : item.second->execution_context) { + oss << ctx.get() << ", "; + } + oss << dec << endl; } return oss.str(); } +EngineContext* TRTEngineCacheResource::GetEngineContext( + const std::vector& input_shapes) { + EngineContext* engine_context = nullptr; + int64 min_matched_batch_size = kint64max; + for (const auto& pair : cache_) { + const std::vector& cached_input_shapes = pair.first; + // This should not happen, but just for safety. + if (input_shapes.size() != cached_input_shapes.size()) { + LOG(ERROR) << "Input shape list size mismatch" + << ", cached size: " << cached_input_shapes.size() + << " vs. 
input size: " << input_shapes.size(); + } + if (AreShapesCompatible(input_shapes, cached_input_shapes)) { + const int cached_batch_size = cached_input_shapes[0].dim_size(0); + if (min_matched_batch_size > cached_batch_size) { + min_matched_batch_size = cached_batch_size; + engine_context = pair.second.get(); + } + } + } + return engine_context; +} + +EngineContext* TRTEngineCacheResource::GetEngineContext(const int profile_id) { + if (profile_id >= profiles_.GetNumProfiles()) { + LOG(ERROR) << "Out of range: profile_id " << profile_id + << " is larger than number of profiles " + << profiles_.GetNumProfiles(); + return nullptr; + } + if (cache_.size() > 1) { + LOG(ERROR) << "Cache is expected to have at most " + << "1 engine in explicit batch mode where profiles are used."; + return nullptr; + } + if (cache_.size() == 0) { + return nullptr; + } + return cache_.begin()->second.get(); +} + } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h index ae54569a726..97995fa186a 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h @@ -123,12 +123,35 @@ struct EngineContext { EngineContext( TrtUniquePtrType&& input_cuda_engine, TrtUniquePtrType&& input_execution_context) + : cuda_engine(std::move(input_cuda_engine)) { + execution_context.push_back(std::move(input_execution_context)); + } + EngineContext(TrtUniquePtrType&& input_cuda_engine, + std::vector>&& + input_execution_context) : cuda_engine(std::move(input_cuda_engine)), execution_context(std::move(input_execution_context)) {} mutex mu; TrtUniquePtrType cuda_engine; - TrtUniquePtrType execution_context + + Status GetExecutionContext(int idx, nvinfer1::IExecutionContext** exec_ctx) + EXCLUSIVE_LOCKS_REQUIRED(mu) { + if (idx >= execution_context.size()) { + return errors::Internal("Requested engine context with index ", idx, + ", but only ", execution_context.size(), + "contexts are present."); + } + *exec_ctx = execution_context[idx].get(); + return Status::OK(); + } + + // In explicit batch mode, we maintain a vector of contexts for each engine, + // where each context is created for a different profile. The + // IExecutionContext object is not thread safe: only one thread should use it + // for inference at a time therefore we need a mutex. More details at + // https://docs.nvidia.com/deeplearning/sdk/tensorrt-best-practices/index.html#thread-safety + std::vector> execution_context GUARDED_BY(mu); }; @@ -172,6 +195,16 @@ class TRTEngineCacheResource : public ResourceBase { string DebugString() const override; + // Returns the EngineContext that is compatible with input_shapes. + // Returns nullptr if no compatible EngineContexts is found in cache. + EngineContext* GetEngineContext(const std::vector& input_shapes); + + // Returns the EngineContext that is compatible with profile_id. + // This function should be only called in explicit batch mode where + // cache size is expected to be at most one. + // Returns nullptr if no compatible EngineContexts is found in cache. + EngineContext* GetEngineContext(const int profile_id); + // Keep device allocator for TRT. 
std::unique_ptr<TRTBaseAllocator> allocator_; diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc index 27ef726514b..70a0a9a7b65 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc @@ -76,7 +76,7 @@ Status TrtShapeOptimizationProfile::AddProfiles( << ". This usually happens when profile is invalid."; } } - if (config->getNbOptimizationProfiles() == 0) { + if (!profiles_.empty() && config->getNbOptimizationProfiles() == 0) { return errors::Internal("Failure in adding an optimization profile."); } // if TRT_VERSION < 6, then we do not need to add diff --git a/tensorflow/compiler/tf2xla/literal_util.cc b/tensorflow/compiler/tf2xla/literal_util.cc index 749a7c3054a..720b81a5097 100644 --- a/tensorflow/compiler/tf2xla/literal_util.cc +++ b/tensorflow/compiler/tf2xla/literal_util.cc @@ -27,6 +27,17 @@ Status HostTensorToBorrowingLiteral(const Tensor& host_tensor, xla::Shape xla_shape; TF_RETURN_IF_ERROR(TensorShapeToXLAShape(host_tensor.dtype(), host_tensor.shape(), &xla_shape)); + return HostTensorToBorrowingLiteral(xla_shape, host_tensor, literal); +} + +Status HostTensorToBorrowingLiteral(const xla::Shape& xla_shape, + const Tensor& host_tensor, + xla::BorrowingLiteral* literal) { + const auto& tshape = host_tensor.shape(); + TF_RET_CHECK(tshape.IsFullyDefined() && + tshape.dims() == xla_shape.dimensions_size() && + tshape.dim_sizes() == xla_shape.dimensions()) + << "Provided xla::Shape must have the same dims as the Tensor shape."; *literal = xla::BorrowingLiteral( static_cast<const char*>(DMAHelper::base(&host_tensor)), xla_shape); return Status::OK(); diff --git a/tensorflow/compiler/tf2xla/literal_util.h b/tensorflow/compiler/tf2xla/literal_util.h index a153dddee61..b1fdf47f5b6 100644 --- a/tensorflow/compiler/tf2xla/literal_util.h +++ b/tensorflow/compiler/tf2xla/literal_util.h @@ -30,6 +30,12 @@ namespace tensorflow { // 'host_tensor'. Status HostTensorToBorrowingLiteral(const Tensor& host_tensor, xla::BorrowingLiteral* literal); +// Similar to the above, except the literal shape is explicitly provided and +// used instead of obtaining it from the 'host_tensor'. The provided literal +// shape 'xla_shape' must be compatible with the shape of 'host_tensor'. +Status HostTensorToBorrowingLiteral(const xla::Shape& xla_shape, + const Tensor& host_tensor, + xla::BorrowingLiteral* literal); // Returns a Literal with the contents of 'host_tensor', backed by its own // storage (i.e., not reusing 'host_tensor's buffers.)
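The new HostTensorToBorrowingLiteral overload above lets the caller supply the xla::Shape instead of deriving it from the tensor. A hypothetical caller is sketched below; BorrowWithExplicitShape is an illustrative helper, not part of this patch, and it assumes only the declarations from literal_util.h above.

```
// Sketch: borrow a host tensor as a literal under a caller-chosen shape.
Status BorrowWithExplicitShape(const Tensor& host_tensor,
                               xla::BorrowingLiteral* literal) {
  xla::Shape xla_shape;
  TF_RETURN_IF_ERROR(TensorShapeToXLAShape(host_tensor.dtype(),
                                           host_tensor.shape(), &xla_shape));
  // The caller may adjust xla_shape here (e.g. its layout), as long as the
  // dimensions still match host_tensor.shape(); the overload verifies this
  // with TF_RET_CHECK.
  return HostTensorToBorrowingLiteral(xla_shape, host_tensor, literal);
}
```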
diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD index 7b53f8504ea..0a47920bd9a 100644 --- a/tensorflow/compiler/xla/client/BUILD +++ b/tensorflow/compiler/xla/client/BUILD @@ -97,6 +97,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla:xla_proto_cc", + "//tensorflow/compiler/xla/service:computation_placer", "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", diff --git a/tensorflow/compiler/xla/client/executable_build_options.cc b/tensorflow/compiler/xla/client/executable_build_options.cc index bb3d3317ec5..cd52e2f5e45 100644 --- a/tensorflow/compiler/xla/client/executable_build_options.cc +++ b/tensorflow/compiler/xla/client/executable_build_options.cc @@ -70,6 +70,12 @@ ExecutableBuildOptions& ExecutableBuildOptions::set_num_partitions( return *this; } +ExecutableBuildOptions& ExecutableBuildOptions::set_device_assignment( + const DeviceAssignment& device_assignment) { + device_assignment_ = device_assignment; + return *this; +} + string ExecutableBuildOptions::ToString() const { string result_layout = "nullopt"; if (result_layout_set_) { diff --git a/tensorflow/compiler/xla/client/executable_build_options.h b/tensorflow/compiler/xla/client/executable_build_options.h index 461fd834115..360ad0260df 100644 --- a/tensorflow/compiler/xla/client/executable_build_options.h +++ b/tensorflow/compiler/xla/client/executable_build_options.h @@ -18,6 +18,7 @@ limitations under the License. #include "absl/strings/string_view.h" #include "absl/types/optional.h" +#include "tensorflow/compiler/xla/service/computation_placer.h" #include "tensorflow/compiler/xla/shape.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla.pb.h" @@ -76,6 +77,18 @@ class ExecutableBuildOptions { int num_partitions() const { return num_partitions_; } ExecutableBuildOptions& set_num_partitions(int num_partitions); + // If set, this specifies a static device assignment for the computation. + // Otherwise, the computation will be compiled generically and can be run with + // any device assignment compatible with the computation's replica and + // partition counts. + bool has_device_assignment() const { return device_assignment_.has_value(); } + ExecutableBuildOptions& set_device_assignment( + const DeviceAssignment& device_assignment); + const DeviceAssignment& device_assignment() const { + CHECK(device_assignment_.has_value()); + return device_assignment_.value(); + } + // Whether input and output buffers are aliased if the associated parameter is // passed-through XLA modules without being changed. 
bool alias_passthrough_params() const { return alias_passthrough_params_; } @@ -91,6 +104,7 @@ class ExecutableBuildOptions { se::DeviceMemoryAllocator* device_allocator_ = nullptr; int num_replicas_ = 1; int num_partitions_ = 1; + absl::optional<DeviceAssignment> device_assignment_; bool alias_passthrough_params_ = false; }; diff --git a/tensorflow/compiler/xla/executable_run_options.cc b/tensorflow/compiler/xla/executable_run_options.cc index 1cfb449ebd0..452c87b23b7 100644 --- a/tensorflow/compiler/xla/executable_run_options.cc +++ b/tensorflow/compiler/xla/executable_run_options.cc @@ -100,6 +100,17 @@ const DeviceAssignment* ExecutableRunOptions::device_assignment() const { return device_assignment_; } +ExecutableRunOptions& ExecutableRunOptions::set_gpu_executable_run_options( + const GpuExecutableRunOptions* gpu_executable_run_options) { + gpu_executable_run_options_ = gpu_executable_run_options; + return *this; +} + +const GpuExecutableRunOptions* +ExecutableRunOptions::gpu_executable_run_options() const { + return gpu_executable_run_options_; +} + ExecutableRunOptions& ExecutableRunOptions::set_rng_seed(int rng_seed) { rng_seed_ = rng_seed; return *this; } diff --git a/tensorflow/compiler/xla/executable_run_options.h b/tensorflow/compiler/xla/executable_run_options.h index ed67bfbeb0d..b44d5f13b68 100644 --- a/tensorflow/compiler/xla/executable_run_options.h +++ b/tensorflow/compiler/xla/executable_run_options.h @@ -38,6 +38,7 @@ namespace xla { class DeviceAssignment; class ExecutionProfile; +class GpuExecutableRunOptions; // A unique identifier for a particular "logical execution" of an XLA model. // @@ -137,6 +138,12 @@ class ExecutableRunOptions { return then_execute_function_; } + // GPU-backend specific options. These are kept out-of-line to avoid bloating + // the size of this dependency for CPU-only AOT builds. + ExecutableRunOptions& set_gpu_executable_run_options( + const GpuExecutableRunOptions* gpu_executable_run_options); + const GpuExecutableRunOptions* gpu_executable_run_options() const; + private: stream_executor::DeviceMemoryAllocator* allocator_ = nullptr; int device_ordinal_ = -1; @@ -148,6 +155,7 @@ class ExecutableRunOptions { stream_executor::Stream* host_to_device_stream_ = nullptr; ThenExecuteFunction* then_execute_function_ = nullptr; RunId run_id_; + const GpuExecutableRunOptions* gpu_executable_run_options_ = nullptr; }; } // namespace xla diff --git a/tensorflow/compiler/xla/g3doc/_book.yaml b/tensorflow/compiler/xla/g3doc/_book.yaml index 6a4ad3bc22b..40bf8f0c42b 100644 --- a/tensorflow/compiler/xla/g3doc/_book.yaml +++ b/tensorflow/compiler/xla/g3doc/_book.yaml @@ -36,6 +36,5 @@ upper_tabs: path: /xla/tutorials/autoclustering_xla - title: Use XLA with tf.function path: /xla/tutorials/compile - status: experimental - include: /_upper_tabs_right.yaml diff --git a/tensorflow/compiler/xla/g3doc/index.md b/tensorflow/compiler/xla/g3doc/index.md index 24de889d2f8..b7868fedb8b 100644 --- a/tensorflow/compiler/xla/g3doc/index.md +++ b/tensorflow/compiler/xla/g3doc/index.md @@ -47,27 +47,13 @@ removing memory operations is one of the best ways to improve performance. The simplest way to start using XLA in TensorFlow models is to enable _auto-clustering_, which automatically finds _clusters_ (connected subgraphs) within the TensorFlow graph which can be compiled and executed using XLA.
-Auto-clustering on GPU can be enabled by either modifying the `TF_XLA_FLAGS` -environment variable: +Auto-clustering on GPU can be enabled by setting the `TF_XLA_FLAGS` environment +variable: ``` $ TF_XLA_FLAGS=--tf_xla_auto_jit=2 path/to/your/tf/program ``` -Or by setting a configuration value within the program: - -``` -import tensorflow as tf - -tf.config.optimizer.set_jit(True) - -# ... the rest of your program ... -``` - -Note: The JIT level is cached for a session, and can only be set in the very -beginning of the program. In order to change it midway through, the session -needs to be cleared: `tf.keras.backend.clear_session()` - Auto-clustering is currently optimized for GPU workloads, but it can also be enabled on CPU by additionally using the flag `--tf_xla_cpu_global_jit`: @@ -75,27 +61,63 @@ enabled on CPU by additionally using the flag `--tf_xla_cpu_global_jit`: $ TF_XLA_FLAGS="--tf_xla_auto_jit=2 --tf_xla_cpu_global_jit" path/to/your/program ``` -Auto-clustering support on a CPU and on multi-GPU environments is experimental. +Note: Auto-clustering support on CPU and on multi-GPU environments is +experimental. -For a detailed usage example, see the -[auto-clustering tutorial colab](./tutorials/autoclustering_xla.ipynb). +For a detailed usage example see the [auto-clustering tutorial +colab](./tutorials/autoclustering_xla.ipynb). -### Explicit compilation +### Explicit compilation with tf.function + +Auto-clustering is a great tool for making the model faster without any changes +to the code, but it may be hard to understand what changes have been performed. The explicit compilation API offers more fine-grained control for choosing which -functions should be compiled with XLA. However, it might require restructuring -of the source code, as not all TensorFlow operations can be represented in XLA. +functions should be compiled. +For example, the following TensorFlow function, which performs MNIST training, +is compiled with XLA: -Note: Using the explicit compilation on API on functions which can not be -represented in XLA results in an exception. +``` +@tf.function(experimental_compile=True) +def train_mnist(images, labels): + images, labels = cast(images, labels) -Optimizing sections of the program using -[`tf.function`](https://www.tensorflow.org/api_docs/python/tf/function) is a -standard approach for [improving -performance](https://www.tensorflow.org/tutorials/customization/performance) of -TF2 programs. You can enable compilation with XLA by setting the -`experimental_compile` argument of `tf.function` to `True`. See the [tutorial -colab](./tutorials/compile.ipynb) for usage examples. + with tf.GradientTape() as tape: + predicted_labels = layer(images) + loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=predicted_labels, labels=labels + )) + layer_variables = layer.trainable_variables + grads = tape.gradient(loss, layer_variables) + optimizer.apply_gradients(zip(grads, layer_variables)) +``` + +The `experimental_compile` API has _must-compile_ semantics: either the entire +function is compiled with XLA, or an `errors.InvalidArgumentError` exception is +thrown. XLA cannot currently compile functions where dimensions are not +_inferrable_: that is, if it's not possible to infer the dimensions of all +tensors without running the entire computation.
For example, the following +function will not compile: + +``` +@tf.function +def not_compilable(x): + return tf.unique(x) +``` + +Shapes can vary across runs, though: + +``` +@tf.function(experimental_compile=True) +def recompiled_on_launch(a, b): + return a + b + +recompiled_on_launch(tf.ones([1, 10]), tf.ones([1, 10])) +recompiled_on_launch(tf.ones([1, 100]), tf.ones([1, 100])) +``` + +See the [tutorial colab](./tutorials/compile.ipynb) for a more detailed usage +example. ### AOT (Ahead-of-time) compilation for CPU with `tfcompile` diff --git a/tensorflow/compiler/xla/g3doc/tutorials/autoclustering_xla.ipynb b/tensorflow/compiler/xla/g3doc/tutorials/autoclustering_xla.ipynb index 78f1bca1478..c0160f2766c 100644 --- a/tensorflow/compiler/xla/g3doc/tutorials/autoclustering_xla.ipynb +++ b/tensorflow/compiler/xla/g3doc/tutorials/autoclustering_xla.ipynb @@ -45,9 +45,9 @@ "source": [ "# Classifying CIFAR-10 with XLA\n", - "In this colab we train a TensorFlow model to classify the [CIFAR-10](https://en.wikipedia.org/wiki/CIFAR-10) dataset, and we compile it using XLA.\n", + "This tutorial trains a TensorFlow model to classify the [CIFAR-10](https://en.wikipedia.org/wiki/CIFAR-10) dataset and compiles it using XLA.\n", - "We start by loading and normalizing the dataset using the Keras API:" + "Load and normalize the dataset using the Keras API:" ] }, { @@ -197,7 +197,8 @@ }, "outputs": [], "source": [ - "tf.keras.backend.clear_session() # We need to clear the session to enable JIT in the middle of the program.\n", + "# We need to clear the session to enable JIT in the middle of the program.\n", + "tf.keras.backend.clear_session()\n", "tf.config.optimizer.set_jit(True) # Enable XLA.\n", "model = compile_model(generate_model())\n", "(x_train, y_train), (x_test, y_test) = load_data()\n", diff --git a/tensorflow/compiler/xla/g3doc/tutorials/compile.ipynb b/tensorflow/compiler/xla/g3doc/tutorials/compile.ipynb index 90af27ce237..59523a549d8 100644 --- a/tensorflow/compiler/xla/g3doc/tutorials/compile.ipynb +++ b/tensorflow/compiler/xla/g3doc/tutorials/compile.ipynb @@ -87,7 +87,6 @@ "outputs": [], "source": [ "import tensorflow as tf\n", - "\n", "tf.compat.v1.enable_eager_execution()" ] }, diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD index 44f7061d1ac..d6c1a034859 100644 --- a/tensorflow/compiler/xla/python/BUILD +++ b/tensorflow/compiler/xla/python/BUILD @@ -353,6 +353,7 @@ pybind_extension( "//tensorflow/compiler/xla/client/lib:math", "//tensorflow/compiler/xla/client/lib:qr", "//tensorflow/compiler/xla/client/lib:self_adjoint_eig", + "//tensorflow/compiler/xla/client/lib:sorting", "//tensorflow/compiler/xla/client/lib:svd", "//tensorflow/compiler/xla/service:computation_placer", "//tensorflow/compiler/xla/service:custom_call_target_registry", diff --git a/tensorflow/compiler/xla/python/local_client.cc b/tensorflow/compiler/xla/python/local_client.cc index 39da7f086b5..a35b20f6aa1 100644 --- a/tensorflow/compiler/xla/python/local_client.cc +++ b/tensorflow/compiler/xla/python/local_client.cc @@ -929,6 +929,7 @@ PyLocalExecutable::Compile(const XlaComputation& computation, VLOG(2) << "PyLocalExecutable::Compile using default device_assignment:\n" << device_assignment->ToString(); } + options.set_device_assignment(device_assignment.value()); if (!argument_layouts) { TF_ASSIGN_OR_RETURN(ProgramShape program_shape, diff --git a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client_extension.cc
b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client_extension.cc index 00e38a5f90d..f6e2fab7ef0 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client_extension.cc +++ b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client_extension.cc @@ -25,6 +25,11 @@ namespace xla { namespace py = pybind11; PYBIND11_MODULE(tpu_client_extension, m) { + // Initializes the NumPy API for the use of the types module. + if (!InitializeNumpyAPIForTypes()) { + throw std::runtime_error("Unable to initialize Numpy API"); + } + py::class_>(m, "TpuClient") .def_static("Get", &PyTpuClient::Get, py::arg("worker")) .def("device_count", &PyTpuClient::device_count) diff --git a/tensorflow/compiler/xla/python/xla.cc b/tensorflow/compiler/xla/python/xla.cc index cd85edad13e..4be375ac15a 100644 --- a/tensorflow/compiler/xla/python/xla.cc +++ b/tensorflow/compiler/xla/python/xla.cc @@ -32,6 +32,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/lib/math.h" #include "tensorflow/compiler/xla/client/lib/qr.h" #include "tensorflow/compiler/xla/client/lib/self_adjoint_eig.h" +#include "tensorflow/compiler/xla/client/lib/sorting.h" #include "tensorflow/compiler/xla/client/lib/svd.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/xla_builder.h" @@ -454,6 +455,7 @@ void BuildOpsSubmodule(py::module* m) { }, py::arg("builder"), py::arg("operands"), py::arg("dimension") = -1, py::arg("comparator") = absl::nullopt); + ops.def("TopK", &TopK, py::arg("input"), py::arg("k")); ops.def("Transpose", &Transpose); ops.def("TriangularSolve", &TriangularSolve); ops.def("Tuple", &Tuple); diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index 6574ccfe898..9d53f9bd082 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -1725,6 +1725,7 @@ _OTHER_OPS = [ 'Rev', 'Select', 'SliceInDim', + 'TopK', ] diff --git a/tensorflow/compiler/xla/refcounting_hash_map.h b/tensorflow/compiler/xla/refcounting_hash_map.h index 3ff6a50d85f..efa1b9e3a50 100644 --- a/tensorflow/compiler/xla/refcounting_hash_map.h +++ b/tensorflow/compiler/xla/refcounting_hash_map.h @@ -42,13 +42,7 @@ template class RefcountingHashMap { public: // Default-constructs new values. - RefcountingHashMap() - : value_factory_([](const K&) { return absl::make_unique(); }) {} - - // Constructs new values according to the given factory function. - explicit RefcountingHashMap( - std::function(const K&)> value_factory) - : value_factory_(std::move(value_factory)) {} + RefcountingHashMap() = default; // Not copyable or movable because this contains internal pointers (namely, // instances of Deleter contain pointers to `this` and into `map_`). @@ -60,8 +54,10 @@ class RefcountingHashMap { // Gets the value for the given key. // // If the map doesn't contain a live value for the key, constructs one - // according to the factory passed to the map's constructor. - std::shared_ptr operator[](const K& key) { + // using `value_factory`. + std::shared_ptr GetOrCreateIfAbsent( + const K& key, + const std::function(const K&)>& value_factory) { absl::MutexLock lock(&mu_); auto it = map_.find(key); // We ensure that the entry has not expired in case deleter was running when @@ -76,7 +72,7 @@ class RefcountingHashMap { // Create entry in the map and then set its value, so the value can // contain a pointer back into the map. 
it = map_.emplace(key, std::weak_ptr()).first; - std::shared_ptr value(value_factory_(key).release(), + std::shared_ptr value(value_factory(key).release(), Deleter{&it->first, this}); it->second = value; // Set the weak ptr to the shared ptr. return value; @@ -112,7 +108,6 @@ class RefcountingHashMap { } }; - std::function(const K&)> value_factory_; absl::Mutex mu_; absl::node_hash_map> map_ ABSL_GUARDED_BY(mu_); }; diff --git a/tensorflow/compiler/xla/refcounting_hash_map_test.cc b/tensorflow/compiler/xla/refcounting_hash_map_test.cc index 753c30dafbe..acb7d7afb46 100644 --- a/tensorflow/compiler/xla/refcounting_hash_map_test.cc +++ b/tensorflow/compiler/xla/refcounting_hash_map_test.cc @@ -47,22 +47,25 @@ struct DeleteNotifier { TEST(RefcountingHashMapTest, PointerIdentity) { RefcountingHashMap m; - std::shared_ptr a = m[0]; - std::shared_ptr b = m[0]; - std::shared_ptr c = m[1]; + auto factory = [](const int&) { return absl::make_unique(); }; + std::shared_ptr a = m.GetOrCreateIfAbsent(0, factory); + std::shared_ptr b = m.GetOrCreateIfAbsent(0, factory); + std::shared_ptr c = m.GetOrCreateIfAbsent(1, factory); EXPECT_EQ(a.get(), b.get()); EXPECT_NE(a.get(), c.get()); } TEST(RefcountingHashMapTest, DefaultInitialized) { RefcountingHashMap m; - EXPECT_EQ(*m[42], 0); + auto factory = [](const int&) { return absl::make_unique(); }; + EXPECT_EQ(*m.GetOrCreateIfAbsent(42, factory), 0); } TEST(RefcountingHashMapTest, DeletesEagerly) { RefcountingHashMap m; bool deleted = false; - auto handle = m[0]; + auto factory = [](const int&) { return absl::make_unique(); }; + auto handle = m.GetOrCreateIfAbsent(0, factory); handle->fn = [&] { deleted = true; }; EXPECT_FALSE(deleted); handle = nullptr; @@ -70,10 +73,10 @@ TEST(RefcountingHashMapTest, DeletesEagerly) { } TEST(RefcountingHashMapTest, CustomFactory) { - RefcountingHashMap m( - [](const int& x) { return absl::make_unique(x + 1); }); - EXPECT_EQ(*m[0], 1); - EXPECT_EQ(*m[100], 101); + RefcountingHashMap m; + auto factory = [](const int& x) { return absl::make_unique(x + 1); }; + EXPECT_EQ(*m.GetOrCreateIfAbsent(0, factory), 1); + EXPECT_EQ(*m.GetOrCreateIfAbsent(100, factory), 101); } TEST(RefcountingHashMapTest, ForEachEmpty) { @@ -85,8 +88,9 @@ TEST(RefcountingHashMapTest, ForEachEmpty) { TEST(RefcountingHashMapTest, ForEachNonempty) { RefcountingHashMap m; - auto a = m[0]; - auto b = m[1]; + auto factory = [](const int&) { return absl::make_unique(); }; + auto a = m.GetOrCreateIfAbsent(0, factory); + auto b = m.GetOrCreateIfAbsent(1, factory); std::vector seen_keys; std::vector seen_values; diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index bb6219eb584..e7f945bfa99 100755 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -27,9 +27,7 @@ package_group( includes = [ "//tensorflow/compiler/xla:friends", ], - packages = [ - "//learning/brain/experimental/tf_runtime/...", - ], + packages = ["//learning/brain/experimental/tf_runtime/..."], ) tf_proto_library_cc( @@ -1953,6 +1951,7 @@ cc_library( hdrs = ["all_reduce_combiner.h"], deps = [ ":hlo", + ":hlo_casting_utils", ":hlo_domain_map", ":hlo_pass", ":hlo_query", @@ -4347,6 +4346,7 @@ cc_library( ":call_graph", ":hlo", ":hlo_pass", + ":hlo_query", ":pattern_matcher", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", @@ -4586,6 +4586,7 @@ cc_library( "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla:statusor", 
"//tensorflow/compiler/xla/service:pattern_matcher", + "//tensorflow/compiler/xla/service/gpu:gpu_executable_run_options", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", # fixdeps: keep "//tensorflow/stream_executor/lib", diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index cfbcb5a4fe2..fd373671b97 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -3204,53 +3204,6 @@ StatusOr AlgebraicSimplifierVisitor::TrySimplifyScalarSlice( return false; } - if (slice->operand(0)->opcode() == HloOpcode::kPad) { - VLOG(10) << "Trying to simplify scalar slice of pad"; - // Check there's no internal padding. Again, we could handle that too, since - // everything is statically known, but it's not worth it. - auto pad = Cast(slice->mutable_operand(0)); - auto padding_config = pad->padding_config(); - int64 rank = padding_config.dimensions_size(); - if (HasInteriorPadding(padding_config)) { - VLOG(10) << "Not folding scalar slice of pad, pad has interior padding"; - return false; - } - - // Check whether the scalar we're slicing out falls into the padding. - bool in_padding = [&]() { - for (int64 i = 0; i < rank; ++i) { - int64 start = slice->slice_starts(i); - int64 low = padding_config.dimensions(i).edge_padding_low(); - int64 data = pad->operand(0)->shape().dimensions(i); - if (start < low || start >= low + data) { - return true; - } - } - return false; - }(); - - if (in_padding) { - VLOG(10) << "Folding scalar slice of pad into padding value"; - TF_RETURN_IF_ERROR(ReplaceWithNewInstruction( - slice, HloInstruction::CreateReshape(slice->shape(), - pad->mutable_padding_value()))); - return true; - } else { - // We already know the output of the slice is scalar. If the padded - // value is scalar, and it's not in the padding, then it's exactly the - // output value. - bool replaced = - ReplaceInstructionIfSameShape(slice, pad->mutable_operand(0)); - if (replaced) { - VLOG(10) << "Folding scalar slice of pad into padded value"; - } else { - VLOG(10) << "Not folding scalar slice of pad into padded value as they " - "have different shapes."; - } - return replaced; - } - } - if (slice->operand(0)->opcode() == HloOpcode::kConcatenate) { VLOG(10) << "Trying to simplify scalar slice of concat"; // Only do this for R1, there's no chance of this being useful otherwise. @@ -3356,20 +3309,54 @@ Status AlgebraicSimplifierVisitor::HandleSlice(HloInstruction* slice) { HloInstruction* pad; HloInstruction* pad_operand; if (Match(slice, m::Slice(m::Pad(&pad, m::Op(&pad_operand), m::Op())))) { + // Is the result of the slice the pad operand. bool slice_undoes_pad = true; + // Can the slice be moved to the pad_operand without any padding being read. + bool slice_inside_pad = true; + // Does this slice slice out pading only. 
+ bool slice_in_padding = false; + std::vector new_starts = slice->slice_starts(); + std::vector new_limits = slice->slice_limits(); for (int64 i = 0; i < slice->shape().rank(); ++i) { - if (slice->slice_starts(i) != - pad->padding_config().dimensions(i).edge_padding_low()) { + const int64 start = slice->slice_starts(i); + const int64 stride = slice->slice_strides(i); + const int64 limit = slice->slice_limits(i); + const int64 size = pad->shape().dimensions(i); + + const auto& dim = pad->padding_config().dimensions(i); + const int64 low = dim.edge_padding_low(); + const int64 high = dim.edge_padding_high(); + const int64 interior = dim.interior_padding(); + const int64 edge = size - high; + + if (limit <= low || start >= edge) { + slice_in_padding = true; + break; + } + + if (start != low || stride - 1 != interior) { slice_undoes_pad = false; } - if (slice->slice_strides(i) - 1 != - pad->padding_config().dimensions(i).interior_padding()) { - slice_undoes_pad = false; + + if (start < low || limit > edge || interior != 0 || stride != 1) { + slice_inside_pad = false; } + new_starts[i] -= low; + new_limits[i] -= low; + } + if (slice_in_padding) { + return ReplaceInstruction( + slice, MakeBroadcastHlo(pad->mutable_operand(1), {}, slice->shape())); } if (slice_undoes_pad && ReplaceInstructionIfSameShape(slice, pad_operand)) { return Status::OK(); } + if (slice_inside_pad) { + TF_ASSIGN_OR_RETURN(HloInstruction * new_slice, + MakeSliceHlo(pad_operand, new_starts, new_limits, + slice->slice_strides())); + return ReplaceInstruction(slice, new_slice); + } } if (slice->operand(0)->opcode() == HloOpcode::kSlice && diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc index 8f66f8084f3..31fa125b3e1 100755 --- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc @@ -4389,7 +4389,7 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadLow) { AlgebraicSimplifier simplifier(options); EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, GmockMatch(m::Reshape(m::Constant()))); + EXPECT_THAT(root, GmockMatch(m::Broadcast(m::Constant()))); } TEST_F(AlgebraicSimplifierTest, SliceOfPadHigh) { @@ -4410,7 +4410,7 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadHigh) { AlgebraicSimplifier simplifier(options); EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, GmockMatch(m::Reshape(m::Constant()))); + EXPECT_THAT(root, GmockMatch(m::Broadcast(m::Constant()))); } TEST_F(AlgebraicSimplifierTest, SliceOfPadMidNonScalar) { @@ -4429,7 +4429,31 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadMidNonScalar) { AlgebraicSimplifierOptions options; AlgebraicSimplifier simplifier(options); - EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie()); + EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + EXPECT_THAT(module->entry_computation()->root_instruction(), + GmockMatch(m::Slice(m::Parameter(0)))); +} + +TEST_F(AlgebraicSimplifierTest, SliceOfPad) { + const char* hlo_string = R"( + HloModule module + + ENTRY test { + param = f32[3,4] parameter(0) + constant = f32[] constant(0.0) + pad = f32[8,10] pad(f32[3,4] param, f32[] constant), padding=3_2x1_5 + ROOT slice = f32[2,3] slice(f32[8,10] pad), slice={[4:6],[2:5]} + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + 
AlgebraicSimplifierOptions options; + AlgebraicSimplifier simplifier(options); + EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, GmockMatch(m::Slice(m::Parameter(0)))); + EXPECT_THAT(root->slice_starts(), ElementsAre(1, 1)); } TEST_F(AlgebraicSimplifierTest, SliceOfPadMidScalarConstant) { @@ -4450,7 +4474,7 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadMidScalarConstant) { AlgebraicSimplifier simplifier(options); EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, GmockMatch(m::Reshape(m::Constant()))); + EXPECT_THAT(root, GmockMatch(m::Broadcast(m::Constant()))); } TEST_F(AlgebraicSimplifierTest, SliceOfPadMidScalar) { @@ -4494,7 +4518,7 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadSomeDimsInPadding) { AlgebraicSimplifier simplifier(options); EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, GmockMatch(m::Reshape(m::ConstantScalar(-7.0)))); + EXPECT_THAT(root, GmockMatch(m::Broadcast(m::ConstantScalar(-7.0)))); } TEST_F(AlgebraicSimplifierTest, SliceOfConcatScalarInput) { diff --git a/tensorflow/compiler/xla/service/all_reduce_combiner.cc b/tensorflow/compiler/xla/service/all_reduce_combiner.cc index 2b41f19f288..9d8f03c92ca 100644 --- a/tensorflow/compiler/xla/service/all_reduce_combiner.cc +++ b/tensorflow/compiler/xla/service/all_reduce_combiner.cc @@ -26,8 +26,10 @@ limitations under the License. #include "absl/container/flat_hash_set.h" #include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" #include "tensorflow/compiler/xla/service/hlo_domain_map.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_query.h" #include "tensorflow/compiler/xla/service/hlo_reachability.h" @@ -80,7 +82,9 @@ Status CombineAllReduces(absl::Span<HloInstruction* const> to_combine) { combined = computation.AddInstruction(HloInstruction::CreateAllReduce( ShapeUtil::MakeTupleShape(operand_shapes), operands, reduction, to_combine.front()->replica_groups(), - /*constrain_layout=*/false, to_combine.front()->channel_id())); + /*constrain_layout=*/false, to_combine.front()->channel_id(), + Cast<HloAllReduceInstruction>(to_combine.front()) + ->use_global_device_ids())); // We have to propagate the sharding manually because Domain instructions are // not guaranteed to preserve it for side-effecting instructions.
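One note on the use_global_device_ids plumbing in this file: when the flag is set, the integers in replica_groups are flattened global device ids rather than replica ids, so two all-reduces with identical groups but different flag values reduce over different device sets and must not be combined. That is why the flag becomes part of GroupKey in the hunk below. The following is an illustrative helper, not part of this patch, assuming the replica-major layout that HasCombinableReplicaGroup later in this patch decodes with `/` and `%`:

```
// Illustrative only: the flattened-id layout assumed by
// HasCombinableReplicaGroup below.
int64 FlattenedDeviceId(int64 replica_id, int64 partition_id,
                        int64 num_partitions) {
  return replica_id * num_partitions + partition_id;
}
```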
@@ -106,6 +110,8 @@ struct GroupKey { accum_type(hlo->to_apply()->root_instruction()->shape().element_type()), domain_id(domain_map.GetDomainMetadataId(hlo)), is_cross_shard(hlo->channel_id().has_value()), + use_global_device_ids( + Cast(hlo)->use_global_device_ids()), replica_groups(hlo->replica_groups()) {} bool operator<(const GroupKey& other) const { @@ -121,6 +127,9 @@ struct GroupKey { if (is_cross_shard != other.is_cross_shard) { return is_cross_shard < other.is_cross_shard; } + if (use_global_device_ids != other.use_global_device_ids) { + return use_global_device_ids < other.use_global_device_ids; + } if (replica_groups.size() != other.replica_groups.size()) { return replica_groups.size() < other.replica_groups.size(); } @@ -143,6 +152,7 @@ struct GroupKey { PrimitiveType accum_type; int64 domain_id; bool is_cross_shard; + bool use_global_device_ids; std::vector replica_groups; }; diff --git a/tensorflow/compiler/xla/service/all_reduce_combiner_test.cc b/tensorflow/compiler/xla/service/all_reduce_combiner_test.cc index 0793ba2ba4b..b486612ff83 100644 --- a/tensorflow/compiler/xla/service/all_reduce_combiner_test.cc +++ b/tensorflow/compiler/xla/service/all_reduce_combiner_test.cc @@ -76,7 +76,8 @@ HloInstruction* MakeCrossReplicaReductions( inputs->push_back(input); all_reduces.push_back(b->AddInstruction(HloInstruction::CreateAllReduce( shape, {input}, reduction, /*replica_groups=*/{}, - /*constrain_layout=*/false, /*channel_id=*/nullopt))); + /*constrain_layout=*/false, /*channel_id=*/nullopt, + /*use_global_device_ids=*/false))); } return b->AddInstruction(HloInstruction::CreateTuple(all_reduces)); } @@ -219,11 +220,12 @@ TEST_F(AllReduceCombinerTest, NoDependentCombination) { HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.3))); auto all_reduce = b.AddInstruction(HloInstruction::CreateAllReduce( constant->shape(), {constant}, reduction, /*replica_groups=*/{}, - /*constrain_layout=*/false, /*channel_id=*/nullopt)); + /*constrain_layout=*/false, /*channel_id=*/nullopt, + /*use_global_device_ids=*/false)); b.AddInstruction(HloInstruction::CreateAllReduce( constant->shape(), {all_reduce}, reduction, /*replica_groups=*/{}, /*constrain_layout=*/false, - /*channel_id=*/nullopt)); + /*channel_id=*/nullopt, /*use_global_device_ids=*/false)); module->AddEntryComputation(b.Build()); @@ -242,16 +244,16 @@ TEST_F(AllReduceCombinerTest, GroupAllReduce) { auto constant = b.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.3))); - auto crs0 = b.AddInstruction( - HloInstruction::CreateAllReduce(constant->shape(), {constant}, reduction, - CreateReplicaGroups({{0, 1}, {2, 3}}), - /*constrain_layout=*/false, - /*channel_id=*/nullopt)); - auto crs1 = b.AddInstruction( - HloInstruction::CreateAllReduce(constant->shape(), {constant}, reduction, - CreateReplicaGroups({{0, 2}, {1, 3}}), - /*constrain_layout=*/false, - /*channel_id=*/nullopt)); + auto crs0 = b.AddInstruction(HloInstruction::CreateAllReduce( + constant->shape(), {constant}, reduction, + CreateReplicaGroups({{0, 1}, {2, 3}}), + /*constrain_layout=*/false, + /*channel_id=*/nullopt, /*use_global_device_ids=*/false)); + auto crs1 = b.AddInstruction(HloInstruction::CreateAllReduce( + constant->shape(), {constant}, reduction, + CreateReplicaGroups({{0, 2}, {1, 3}}), + /*constrain_layout=*/false, + /*channel_id=*/nullopt, /*use_global_device_ids=*/false)); b.AddInstruction(HloInstruction::CreateTuple({crs0, crs1})); module->AddEntryComputation(b.Build()); diff --git 
a/tensorflow/compiler/xla/service/ar_crs_combiner.cc b/tensorflow/compiler/xla/service/ar_crs_combiner.cc
index ec8c391a542..40a40186347 100644
--- a/tensorflow/compiler/xla/service/ar_crs_combiner.cc
+++ b/tensorflow/compiler/xla/service/ar_crs_combiner.cc
@@ -24,7 +24,9 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/call_graph.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_query.h"
 #include "tensorflow/compiler/xla/service/hlo_replication_analysis.h"
 #include "tensorflow/compiler/xla/service/pattern_matcher.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -32,6 +34,96 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"

 namespace xla {
+namespace {
+
+// In SPMD mode, if there is a cross-replica all-reduce that produces the same
+// value for all partitions, replace it with a global all-reduce and then
+// divide by the number of partitions. Depending on the topology and the
+// backend's all-reduce implementation, this may give better performance.
+StatusOr<bool> ReplaceReplicatedAllReduce(HloModule* module,
+                                          int64 replica_count,
+                                          int64 partition_count) {
+  TF_ASSIGN_OR_RETURN(
+      auto replication_analysis,
+      HloReplicationAnalysis::Run(module, /*cross_partition_spmd=*/true));
+
+  bool changed = false;
+  int64 next_channel = hlo_query::NextChannelId(*module);
+  for (auto computation : module->computations()) {
+    for (auto instruction : computation->instructions()) {
+      if (auto ar = DynCast<HloAllReduceInstruction>(instruction)) {
+        const Shape& shape = ar->shape();
+        if (ar->channel_id()) {
+          continue;
+        }
+        if (ar->replica_groups().size() > 1) {
+          continue;
+        }
+        if (shape.IsTuple() || shape.element_type() != F32) {
+          continue;
+        }
+        // We would need a cost model for the target, but in general we want
+        // to rewrite only if the replica count in the original op was large.
+        if (replica_count < 8 * partition_count) {
+          continue;
+        }
+        if (replication_analysis->HloInstructionIsReplicatedAt(ar, {})) {
+          VLOG(2) << "Replaced replicated all-reduce:" << ar->ToString();
+          ar->set_channel_id(next_channel++);
+          auto divisor =
+              computation->AddInstruction(HloInstruction::CreateConstant(
+                  LiteralUtil::CreateR0<float>(partition_count)));
+          auto bcast = computation->AddInstruction(
+              HloInstruction::CreateBroadcast(shape, divisor, {}));
+          auto div = computation->AddInstruction(HloInstruction::CreateBinary(
+              ar->shape(), HloOpcode::kDivide, ar, bcast));
+          TF_RETURN_IF_ERROR(ar->ReplaceAllUsesWith(div));
+          changed = true;
+        }
+      }
+    }
+  }
+  return changed;
+}
+
+// Returns true if the given instruction (which must be a cross-partition
+// all-reduce) has a ReplicaGroup config that can be combined with a
+// cross-replica all-reduce. We currently restrict this to groups where all
+// partitions in each replica belong to the same group.
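As a worked instance of that restriction (the numbers come from the AllReduceWithGlobalIdReplicaGroups test added later in this patch: 2 replicas, 4 partitions, replica_groups={{0,1,2,3},{4,5,6,7}}), the check that the following function performs reduces to this arithmetic:

// Under use_global_device_ids, id / num_partitions is the replica and
// id % num_partitions is the partition. Group {4,5,6,7} passes the check:
std::unordered_set<int64> partition_ids;
for (int64 id : {4, 5, 6, 7}) {
  CHECK_EQ(id / 4, 1);           // every id belongs to replica 1
  partition_ids.insert(id % 4);  // partitions 0,1,2,3 are all collected
}
CHECK_EQ(partition_ids.size(), 4);  // the group covers every partition once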
+bool HasCombinableReplicaGroup(HloInstruction* hlo, int64 num_replicas,
+                               int64 num_partitions) {
+  auto all_reduce = Cast<HloAllReduceInstruction>(hlo);
+  auto replica_groups = all_reduce->replica_groups();
+  CHECK(all_reduce->IsCrossModuleAllReduce());
+
+  if (all_reduce->use_global_device_ids()) {
+    if (replica_groups.size() != num_replicas) {
+      return false;
+    }
+    for (auto group : replica_groups) {
+      if (group.replica_ids_size() != num_partitions) {
+        return false;
+      }
+      std::unordered_set<int64> partition_ids;
+      int64 replica_id = group.replica_ids(0) / num_partitions;
+      for (int64 i = 0; i < num_partitions; ++i) {
+        if (group.replica_ids(i) / num_partitions != replica_id) {
+          return false;
+        }
+        partition_ids.insert(group.replica_ids(i) % num_partitions);
+      }
+      if (partition_ids.size() != num_partitions) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  return replica_groups.size() == num_replicas;
+}
+
+}  // namespace

 namespace m = match;
@@ -73,7 +165,8 @@ absl::optional<ArCrsCombiner::ArCrsPair> ArCrsCombiner::MatchesArCrsPattern(
   // belongs to its own group, since the later cross-replica all-reduce combines
   // along the replica dimension.
   if (instruction->IsCrossModuleAllReduce() &&
-      instruction->replica_groups().size() == num_replicas_ &&
+      HasCombinableReplicaGroup(instruction, num_replicas_,
+                                num_spatial_partitions_) &&
       computation_is_addition(instruction->called_computations()[0]) &&
       instruction->user_count() == 1) {
     auto next = instruction->users()[0];
@@ -491,6 +584,12 @@ StatusOr<bool> ArCrsCombiner::RewriteGraph() {
       next = next->users()[0];
     }
     // The AllReduce and the CRS are combined to an all-core AllReduce.
+    //
+    // Note that we can just reuse the ReplicaGroup config of the cross-replica
+    // all-reduce, since we already checked that the cross-partition all-reduce
+    // is always across all partitions (HasCombinableReplicaGroup). We would
+    // need to combine ReplicaGroup configs using global ids here if we relaxed
+    // that restriction.
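The ReplaceReplicatedAllReduceSPMD test in the next hunk pins down the shape of the SPMD rewrite; paraphrased in HLO terms (channel id elided, since the pass picks the next free one):

// Before: cross-replica all-reduce over an operand replicated across partitions.
//   %all-reduce = f32[2,4] all-reduce(%p), replica_groups={{0,1}}, to_apply=%sum.f32
// After: channel-carrying (global) all-reduce, divided by partition_count (4 here).
//   %ar  = f32[2,4] all-reduce(%p), channel_id=<next free id>, to_apply=%sum.f32
//   %div = f32[2,4] divide(%ar, f32[2,4] broadcast(f32[] constant(4)))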
next->set_channel_id(channel_id); } } @@ -508,7 +607,16 @@ StatusOr ArCrsCombiner::Run(HloModule* module) { TF_RETURN_IF_ERROR(KeepProvablyEqualInstructionGroupsMPMD()); } - return RewriteGraph(); + TF_ASSIGN_OR_RETURN(auto changed, RewriteGraph()); + + if (num_replicas_ > 1 && spmd_partition_) { + TF_ASSIGN_OR_RETURN(auto replaced, + ReplaceReplicatedAllReduce(module, num_replicas_, + num_spatial_partitions_)); + changed |= replaced; + } + + return changed; } } // namespace xla diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc b/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc index 609da2c33a0..a02d5a86a27 100644 --- a/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc +++ b/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc @@ -1711,9 +1711,9 @@ HloModule foobar ENTRY %entrycomp (p: bf16[]) -> (f32[]) { %p = bf16[] parameter(0) - %all-reduce.0 = f32[] all-reduce(%p), channel_id=1, replica_groups={{0,1}}, + %all-reduce.0 = f32[] all-reduce(%p), channel_id=1, replica_groups={{0},{1}}, to_apply=%sum.f32 - %all-reduce.2 = f32[] all-reduce(%all-reduce.0), replica_groups={{0,1}}, + %all-reduce.2 = f32[] all-reduce(%all-reduce.0), replica_groups={{0},{1}}, to_apply=%sum.f32 ROOT %tuple = (f32[]) tuple(%all-reduce.2) } @@ -1727,5 +1727,68 @@ ENTRY %entrycomp (p: bf16[]) -> (f32[]) { EXPECT_FALSE(changed); } +TEST_F(ArCrsCombinerTest, ReplaceReplicatedAllReduceSPMD) { + const char* module_str = R"( +HloModule foobar + +%sum.f32 (x: f32[], y: f32[]) -> f32[] { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %add = f32[] add(%x, %y) +} + +ENTRY %entrycomp (p: f32[2,4]) -> f32[2,4] { + %p = f32[2,4] parameter(0), sharding={replicated} + ROOT %all-reduce = f32[2,4] all-reduce(%p), replica_groups={{0,1}}, + to_apply=%sum.f32 +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str)); + ArCrsCombiner combiner(/*num_spatial_partitions=*/4, /*num_replicas=*/64, + /*spmd_partition=*/true); + auto changed = combiner.Run(module.get()).ValueOrDie(); + EXPECT_TRUE(changed); + + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Divide(op::AllReduce(op::Parameter()), + op::Broadcast(op::Constant()))); + + auto ar = root->operand(0); + auto divisor = root->operand(1)->operand(0); + EXPECT_TRUE(ar->channel_id()); + EXPECT_TRUE(divisor->literal().IsAllFloat(4)); +} + +TEST_F(ArCrsCombinerTest, AllReduceWithGlobalIdReplicaGroups) { + const char* module_str = R"( +HloModule foobar + +%sum.f32 (x: f32[], y: f32[]) -> f32[] { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %add = f32[] add(%x, %y) +} + +ENTRY %entrycomp (p: bf16[]) -> (f32[]) { + %p = bf16[] parameter(0) + %all-reduce.0 = f32[] all-reduce(%p), channel_id=1, + replica_groups={{0,1,2,3},{4,5,6,7}}, use_global_device_ids=true, + to_apply=%sum.f32 + %all-reduce.2 = f32[] all-reduce(%all-reduce.0), replica_groups={{0,1}}, + to_apply=%sum.f32 + ROOT %tuple = (f32[]) tuple(%all-reduce.2) +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str)); + ArCrsCombiner combiner(/*num_spatial_partitions=*/4, /*num_replicas=*/2, + /*spmd_partition=*/true); + auto changed = combiner.Run(module.get()).ValueOrDie(); + EXPECT_TRUE(changed); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc b/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc index ac5edd82bee..a522d230f5a 100644 --- 
a/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc +++ b/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc @@ -240,7 +240,7 @@ TEST_F(BFloat16ConversionFoldingTest, FoldAllReduceTupleOutput) { ShapeUtil::MakeTupleShape({f32_shape, f32_shape}), {convert_a, b}, sum, /*replica_groups=*/{}, /*constrain_layout=*/false, - /*channel_id=*/absl::nullopt)); + /*channel_id=*/absl::nullopt, /*use_global_device_ids=*/false)); HloInstruction* gte_a = builder.AddInstruction( HloInstruction::CreateGetTupleElement(f32_shape, crs, 0)); HloInstruction* gte_b = builder.AddInstruction( diff --git a/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc b/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc index ec93a868022..78924908015 100644 --- a/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc +++ b/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc @@ -260,7 +260,8 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleAllReduce) { ShapeUtil::MakeTupleShape({f32_shape, bf16_shape}), {a, b}, reduction, /*replica_groups=*/{}, /*constrain_layout=*/false, - /*channel_id=*/absl::nullopt)); + /*channel_id=*/absl::nullopt, + /*use_global_device_ids=*/false)); builder.AddInstruction( HloInstruction::CreateGetTupleElement(bf16_shape, crs, 1)); diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc index aee1f652abd..048c0edc4a5 100644 --- a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc +++ b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc @@ -212,7 +212,7 @@ TEST_F(BFloat16PropagationTest, DoNotChangeAllReduce) { builder.AddInstruction(HloInstruction::CreateAllReduce( ShapeUtil::MakeTupleShape({shape, shape}), {a, b}, reduction, /*replica_groups=*/{}, /*constrain_layout=*/false, - /*channel_id=*/1)); + /*channel_id=*/1, /*use_global_device_ids=*/false)); HloInstruction* gte0 = builder.AddInstruction( HloInstruction::CreateGetTupleElement(shape, all_reduce, 0)); HloInstruction* gte1 = builder.AddInstruction( diff --git a/tensorflow/compiler/xla/service/collective_ops_utils.cc b/tensorflow/compiler/xla/service/collective_ops_utils.cc index cfe586c6c0b..a4eba334f31 100644 --- a/tensorflow/compiler/xla/service/collective_ops_utils.cc +++ b/tensorflow/compiler/xla/service/collective_ops_utils.cc @@ -44,7 +44,7 @@ absl::optional MatchReductionComputation( } StatusOr> GetParticipatingReplicas( - int64 device_ordinal, absl::Span replica_groups, + GlobalDeviceId device_id, absl::Span replica_groups, int64 total_replica_count, const DeviceAssignment& device_assn) { std::vector participating_replicas; @@ -58,7 +58,7 @@ StatusOr> GetParticipatingReplicas( // Use the DeviceAssignment to figure out our replica-id. TF_ASSIGN_OR_RETURN(int replica_id, - device_assn.ReplicaIdForDeviceOrdinal(device_ordinal)); + device_assn.ReplicaIdForDeviceOrdinal(device_id.value())); // Figure out the other replicas that go together with this one. absl::optional replica_group; diff --git a/tensorflow/compiler/xla/service/collective_ops_utils.h b/tensorflow/compiler/xla/service/collective_ops_utils.h index 2524b4190e9..d9b6c48685b 100644 --- a/tensorflow/compiler/xla/service/collective_ops_utils.h +++ b/tensorflow/compiler/xla/service/collective_ops_utils.h @@ -21,6 +21,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/executable_run_options.h" #include "tensorflow/compiler/xla/service/computation_placer.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/pattern_matcher.h" @@ -37,9 +38,9 @@ absl::optional MatchReductionComputation( const HloComputation* computation); // Figures out which devices (named by their replica-ids) are participating in -// the all-reduce subgroup that contains device_ordinal. +// the all-reduce subgroup that contains device_id. StatusOr> GetParticipatingReplicas( - int64 device_ordinal, absl::Span replica_groups, + GlobalDeviceId device_id, absl::Span replica_groups, int64 total_replica_count, const DeviceAssignment& device_assn); // Key that identifies a particular Rendezvous object in our global hashtable. @@ -72,16 +73,18 @@ struct RendezvousKey { }; explicit RendezvousKey(const RunId& run_id, - std::vector participating_replicas, + std::vector global_devices, + int num_local_participants, CollectiveOpKind collective_op_kind, int64 op_id) : run_id(run_id), - participating_replicas(participating_replicas), + global_devices(std::move(global_devices)), + num_local_participants(num_local_participants), collective_op_kind(collective_op_kind), op_id(op_id) {} static RendezvousKey FromInstruction( - const RunId& run_id, std::vector participating_replicas, - const HloInstruction* instr) { + const RunId& run_id, std::vector global_devices, + int num_local_participants, const HloInstruction* instr) { CollectiveOpKind collective_op_kind; int64 op_id; @@ -91,20 +94,19 @@ struct RendezvousKey { : std::make_pair( kCrossReplica, static_cast(instr->GetModule()->unique_id())); - return RendezvousKey(run_id, participating_replicas, collective_op_kind, - op_id); + return RendezvousKey(run_id, std::move(global_devices), + num_local_participants, collective_op_kind, op_id); } - int num_participants() const { return participating_replicas.size(); } - template friend H AbslHashValue(H h, const RendezvousKey& k) { - return H::combine(std::move(h), k.run_id, k.participating_replicas, + return H::combine(std::move(h), k.run_id, k.global_devices, + k.num_local_participants, static_cast(k.collective_op_kind), k.op_id); } friend bool operator==(const RendezvousKey& a, const RendezvousKey& b) { - return a.run_id == b.run_id && - a.participating_replicas == b.participating_replicas && + return a.run_id == b.run_id && a.global_devices == b.global_devices && + a.num_local_participants == b.num_local_participants && a.collective_op_kind == b.collective_op_kind && // a.op_id == b.op_id; } @@ -114,14 +116,15 @@ struct RendezvousKey { string ToString() const { return absl::StrFormat( - "RendezvousKey{run_id=%s, participating_replicas=[%s], " - "collective_op_kind=%d, op_id=%d}", - run_id.ToString(), absl::StrJoin(participating_replicas, ","), - static_cast(collective_op_kind), op_id); + "RendezvousKey{run_id=%s, global_devices=[%s], " + "num_local_participants=%d, collective_op_kind=%d, op_id=%d}", + run_id.ToString(), GlobalDeviceIdsToString(global_devices), + num_local_participants, static_cast(collective_op_kind), op_id); } RunId run_id; - std::vector participating_replicas; + std::vector global_devices; + int num_local_participants; CollectiveOpKind collective_op_kind; int64 op_id; }; @@ -164,10 +167,13 @@ struct AllReduceParticipantData { }; std::vector buffers; se::Stream* stream; + const 
NcclUniqueIdCallback* nccl_unique_id_callback = nullptr; ReductionKind reduction_kind; - int num_participants() const { return rendezvous_key.num_participants(); } + // For each local all-reduce participant a (global ID, local device ordinal) + // pair for the participant. Participants are in no particular order. + std::vector> local_devices; string ToString() const { std::vector buffer_strs; @@ -303,12 +309,13 @@ class Rendezvous { const RendezvousKey key_; tensorflow::BlockingCounter all_participants_present_{ - key_.num_participants()}; - tensorflow::BlockingCounter done_{key_.num_participants()}; + key_.num_local_participants}; + tensorflow::BlockingCounter done_{key_.num_local_participants}; // tensorflow::BlockingCounter returned by SubmitParticipant. std::shared_ptr returned_blocking_counter_{ - std::make_shared(key_.num_participants())}; + std::make_shared( + key_.num_local_participants)}; }; } // end namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc index 98c23b679fa..60e184411e9 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc @@ -382,10 +382,7 @@ class CpuAllReduceRendezvous : public xla::Rendezvous { xla::RefcountingHashMap& GlobalRendezvousMap() { static auto& m = - *new xla::RefcountingHashMap( - [](const xla::RendezvousKey& k) { - return absl::make_unique(k); - }); + *new xla::RefcountingHashMap; return m; } @@ -411,18 +408,28 @@ TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_AllReduce( std::vector group = xla::ParseReplicaGroupsOnly(replica_groups_serialized).ValueOrDie(); - xla::int32 replica_count = run_options->device_assignment()->replica_count(); - std::vector participating_replicas_vec = - xla::GetParticipatingReplicas(device_ordinal, group, replica_count, + const xla::DeviceAssignment& device_assignment = + *run_options->device_assignment(); + xla::int32 replica_count = device_assignment.replica_count(); + CHECK_EQ(device_assignment.computation_count(), 1); + std::vector participating_replicas = + xla::GetParticipatingReplicas(xla::GlobalDeviceId(device_ordinal), group, + replica_count, *run_options->device_assignment()) .ValueOrDie(); xla::RendezvousKey::CollectiveOpKind op_kind = channel_id_present ? 
xla::RendezvousKey::kCrossModule : xla::RendezvousKey::kCrossReplica; - xla::RendezvousKey rendezvous_key(run_options->run_id(), - participating_replicas_vec, op_kind, op_id); - + std::vector participating_devices; + participating_devices.reserve(participating_replicas.size()); + for (xla::int64 replica : participating_replicas) { + participating_devices.push_back( + xla::GlobalDeviceId(device_assignment(replica, 0))); + } + xla::RendezvousKey rendezvous_key( + run_options->run_id(), std::move(participating_devices), + participating_replicas.size(), op_kind, op_id); auto shape_str = ShapeString(shape_ptr, shape_length); VLOG(2) << "All-reduce input/output shape : " << shape_str; @@ -444,10 +451,17 @@ TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_AllReduce( participant.buffers = {buffer}; participant.reduction_kind = static_cast(reduction_kind); - TF_CHECK_OK( - CpuAllReduceRendezvous::SubmitParticipant( - [&] { return GlobalRendezvousMap()[rendezvous_key]; }, participant) - .status()); + auto make_cpu_rendezvous = [](const xla::RendezvousKey& k) { + return absl::make_unique(k); + }; + + TF_CHECK_OK(CpuAllReduceRendezvous::SubmitParticipant( + [&] { + return GlobalRendezvousMap().GetOrCreateIfAbsent( + rendezvous_key, make_cpu_rendezvous); + }, + participant) + .status()); } TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_ReplicaId( diff --git a/tensorflow/compiler/xla/service/cpu/runtime_conv2d_mkl.cc b/tensorflow/compiler/xla/service/cpu/runtime_conv2d_mkl.cc index c60580d6e76..85259afbda6 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_conv2d_mkl.cc +++ b/tensorflow/compiler/xla/service/cpu/runtime_conv2d_mkl.cc @@ -144,7 +144,9 @@ void MKLConvImpl(const EigenDevice& device, ScalarType* out, ScalarType* lhs, if (need_output_conversion) { net.push_back(reorder(conv1_dst_memory, user_dst_memory)); } - stream(stream::kind::eager).submit(net).wait(); +#ifndef ENABLE_MKLDNN_V1 + stream(stream::kind::eager_nostore).submit(net).wait(); +#endif } } // namespace #endif // INTEL_MKL diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index d13eca30cdc..86da500b1dd 100755 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -54,6 +54,22 @@ tf_proto_library_cc( protodeps = ["//tensorflow/compiler/xla:xla_data_proto"], ) +cc_library( + name = "gpu_executable_run_options", + srcs = ["gpu_executable_run_options.cc"], + hdrs = ["gpu_executable_run_options.h"], + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", + "//tensorflow/core:lib_internal", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", + "@com_google_absl//absl/types:span", + ], +) + cc_library( name = "gpu_constants", srcs = ["gpu_constants.cc"], @@ -385,6 +401,7 @@ cc_library( hdrs = ["thunk.h"], deps = [ ":buffer_allocations", + ":gpu_executable_run_options", ":hlo_execution_profiler", "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla/service:hlo", @@ -413,6 +430,7 @@ tf_cuda_library( ":buffer_allocations", ":hlo_execution_profiler", ":thunk", + ":gpu_executable_run_options", "@com_google_absl//absl/base:core_headers", "//tensorflow/compiler/xla/service:pattern_matcher", "//tensorflow/compiler/xla:refcounting_hash_map", @@ -522,6 +540,7 @@ cc_library( ":cudnn_batchnorm_runner", ":gpu_conv_runner", ":gpu_debug_info_manager", + 
":gpu_executable_run_options", ":gpu_types", ":hlo_execution_profiler", ":infeed_manager", diff --git a/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.cc b/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.cc index 2fe359861f8..2a071cd658d 100644 --- a/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.cc @@ -211,10 +211,7 @@ StatusOr> Rendezvous::SubmitParticipant( // Rendezvous objects are one-time use, so they're removed from this map once // we're through with them. RefcountingHashMap& GlobalRendezvousMap() { - static auto& m = *new RefcountingHashMap( - [](const RendezvousKey& key) { - return absl::make_unique(key); - }); + static auto& m = *new RefcountingHashMap(); return m; } @@ -233,7 +230,11 @@ Status CollectivePermuteThunk::ExecuteOnStream(const ExecuteParams& params) { // Rendezvous with the threads for all other devices that are participating in // this CollectivePermute. RendezvousKey key{params.run_id, params.device_assn->replica_count()}; - std::shared_ptr rendezvous = GlobalRendezvousMap()[key]; + auto rendezvous_factory = [](const RendezvousKey& key) { + return absl::make_unique(key); + }; + std::shared_ptr rendezvous = + GlobalRendezvousMap().GetOrCreateIfAbsent(key, rendezvous_factory); TF_ASSIGN_OR_RETURN(int64 replica_id, params.device_assn->ReplicaIdForDeviceOrdinal( diff --git a/tensorflow/compiler/xla/service/gpu/dummy_all_reduce_thunk.cc b/tensorflow/compiler/xla/service/gpu/dummy_all_reduce_thunk.cc index 7c3d76c1c92..998a3ccb4ee 100644 --- a/tensorflow/compiler/xla/service/gpu/dummy_all_reduce_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/dummy_all_reduce_thunk.cc @@ -34,7 +34,7 @@ Status NcclAllReduceThunk::ExecuteOnStream(const ExecuteParams& params) { NcclAllReduceThunk::~NcclAllReduceThunk() = default; -/*static*/ absl::flat_hash_set +/*static*/ absl::flat_hash_set NcclAllReduceThunk::DevicesWithOpenNcclChannels() { return {}; } diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index df44f379b99..d4797e094fd 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" #include "tensorflow/compiler/xla/service/gpu/gpu_debug_info_manager.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h" #include "tensorflow/compiler/xla/service/gpu/gpu_types.h" #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -196,13 +197,21 @@ Status GpuExecutable::ExecuteThunks( VLOG(2) << "Executing the thunk for " << thunk->hlo_instruction()->ToString() << " on stream " << stream_no; + const GpuExecutableRunOptions* gpu_options = + run_options->run_options().gpu_executable_run_options(); Thunk::ExecuteParams thunk_params{ &buffer_allocations, stream, run_options->run_options().run_id(), &profiler, run_options->run_options().device_assignment(), - &deferred_host_callbacks}; + &deferred_host_callbacks, + gpu_options && gpu_options->gpu_global_device_ids() + ? &*gpu_options->gpu_global_device_ids() + : nullptr, + gpu_options && gpu_options->nccl_unique_id_callback() + ? 
&gpu_options->nccl_unique_id_callback() + : nullptr}; TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(thunk_params)); if (thunk_schedule_->Depended(thunk)) { auto finish_event = absl::make_unique(main_stream->parent()); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.cc new file mode 100644 index 00000000000..b152962eb99 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.cc @@ -0,0 +1,62 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h" + +#include "absl/algorithm/container.h" +#include "absl/strings/str_join.h" + +namespace xla { + +std::string GlobalDeviceIdsToString(absl::Span ids) { + std::vector values; + values.reserve(ids.size()); + for (GlobalDeviceId id : ids) { + values.push_back(id.value()); + } + return absl::StrJoin(values, ","); +} + +NcclCliqueKey::NcclCliqueKey(std::vector devices) + : devices_(std::move(devices)) { + absl::c_sort(devices_); + CHECK(absl::c_adjacent_find(devices_) == devices_.end()) + << "Duplicate devices are not allowed: " + << GlobalDeviceIdsToString(devices_); +} + +GpuExecutableRunOptions& GpuExecutableRunOptions::set_gpu_global_device_ids( + absl::optional> gpu_global_device_ids) { + gpu_global_device_ids_ = std::move(gpu_global_device_ids); + return *this; +} + +const absl::optional>& +GpuExecutableRunOptions::gpu_global_device_ids() const { + return gpu_global_device_ids_; +} + +GpuExecutableRunOptions& GpuExecutableRunOptions::set_nccl_unique_id_callback( + NcclUniqueIdCallback nccl_unique_id_callback) { + nccl_unique_id_callback_ = std::move(nccl_unique_id_callback); + return *this; +} + +const NcclUniqueIdCallback& GpuExecutableRunOptions::nccl_unique_id_callback() + const { + return nccl_unique_id_callback_; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h b/tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h new file mode 100644 index 00000000000..7a43c80121b --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h @@ -0,0 +1,90 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
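Since the NcclCliqueKey constructor above sorts its input and CHECK-fails on duplicates, keys are insensitive to the order in which the caller lists devices; a small usage sketch under that assumption:

// Equal cliques regardless of enumeration order.
xla::NcclCliqueKey a({xla::GlobalDeviceId(3), xla::GlobalDeviceId(1)});
xla::NcclCliqueKey b({xla::GlobalDeviceId(1), xla::GlobalDeviceId(3)});
CHECK(a == b);
// NcclCliqueKey({GlobalDeviceId(1), GlobalDeviceId(1)}) would CHECK-fail.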
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_EXECUTABLE_RUN_OPTIONS_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_EXECUTABLE_RUN_OPTIONS_H_
+
+#include <functional>
+#include <string>
+#include <vector>
+
+#include "absl/types/optional.h"
+#include "absl/types/span.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/gtl/int_type.h"
+
+namespace xla {
+
+// Strongly-typed integer type for naming a device globally within a
+// distributed system. XLA doesn't have a strong opinion about what global
+// numbering scheme is applied to GPUs; the user must provide a local ->
+// global mapping via GpuExecutableRunOptions for the local GPUs.
+TF_LIB_GTL_DEFINE_INT_TYPE(GlobalDeviceId, int64);
+
+// Returns a comma-separated string of global device IDs.
+std::string GlobalDeviceIdsToString(absl::Span<GlobalDeviceId const> ids);
+
+// Key for naming a particular NCCL clique. This is just a set of unique
+// device IDs (i.e. GPU IDs). The device IDs must be global within a cluster.
+class NcclCliqueKey {
+ public:
+  explicit NcclCliqueKey(std::vector<GlobalDeviceId> devices);
+
+  template <typename H>
+  friend H AbslHashValue(H h, const NcclCliqueKey& k) {
+    return H::combine(std::move(h), k.devices_);
+  }
+  friend bool operator==(const NcclCliqueKey& a, const NcclCliqueKey& b) {
+    return a.devices_ == b.devices_;
+  }
+
+  const std::vector<GlobalDeviceId>& devices() const { return devices_; }
+
+ private:
+  std::vector<GlobalDeviceId> devices_;
+};
+
+using NcclUniqueIdCallback =
+    std::function<StatusOr<std::string>(const NcclCliqueKey&)>;
+
+// GPU-specific executable options.
+// We keep these separate from ExecutableRunOptions to avoid adding
+// dependencies to ExecutableRunOptions.
+class GpuExecutableRunOptions {
+ public:
+  // Sets a mapping from local device ordinals to global device IDs.
+  // Used only on NVidia GPUs for cross-host NCCL collectives. If set, the
+  // elements of `device_assignment` are interpreted as global device IDs, not
+  // local device ordinals.
+  GpuExecutableRunOptions& set_gpu_global_device_ids(
+      absl::optional<std::vector<GlobalDeviceId>> gpu_global_device_ids);
+  const absl::optional<std::vector<GlobalDeviceId>>& gpu_global_device_ids()
+      const;
+
+  // Callback that returns a ncclUniqueId encoded as a string for a group of
+  // communicating GPU devices. Used only on NVidia GPUs.
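The patch leaves it to the client how this callback is implemented. The sketch below is purely illustrative: KeyValueStore and its Set/Get methods are assumed stand-ins for whatever coordination service the cluster already runs, with the convention that one process generates the id and the rest fetch it.

// Hypothetical client-side glue; KeyValueStore is not XLA or NCCL API.
xla::NcclUniqueIdCallback MakeUniqueIdCallback(KeyValueStore* store,
                                               bool generates_ids) {
  return [=](const xla::NcclCliqueKey& key) -> xla::StatusOr<std::string> {
    const std::string k =
        "nccl_clique/" + xla::GlobalDeviceIdsToString(key.devices());
    if (generates_ids) {
      ncclUniqueId id;
      if (ncclGetUniqueId(&id) != ncclSuccess) {
        return xla::InternalError("ncclGetUniqueId failed");
      }
      std::string bytes(reinterpret_cast<const char*>(&id), sizeof(id));
      store->Set(k, bytes);  // assumed blocking publish
      return bytes;
    }
    return store->Get(k);  // assumed blocking fetch returning StatusOr<string>
  };
}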
+  GpuExecutableRunOptions& set_nccl_unique_id_callback(
+      NcclUniqueIdCallback nccl_unique_id_callback);
+  const NcclUniqueIdCallback& nccl_unique_id_callback() const;
+
+ private:
+  absl::optional<std::vector<GlobalDeviceId>> gpu_global_device_ids_;
+  NcclUniqueIdCallback nccl_unique_id_callback_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_EXECUTABLE_RUN_OPTIONS_H_
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 8efcd2384a3..f99e43cc06d 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -2346,7 +2346,9 @@ void IrEmitterUnnested::EmitEpilogueForReduction(
         shared_cache, {b_.getInt32(0), constant(j), thread_id_info.lane_id}));
     llvm::Value* initial_value = reduction_info.GetInitialValues()[i];
-    llvm::Value* initial_value_addr = b_.CreateAlloca(element_type);
+    llvm::Value* initial_value_addr =
+        shared_to_global(llvm_ir::EmitAllocaAtFunctionEntry(
+            element_type, "initial_value_addr", &b_));
     b_.CreateStore(initial_value, initial_value_addr);

     llvm::Value* warp_exists = b_.CreateICmpULT(
diff --git a/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc b/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc
index 4498793113a..52c4fb93199 100644
--- a/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc
@@ -33,6 +33,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/refcounting_hash_map.h"
 #include "tensorflow/compiler/xla/service/collective_ops_utils.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h"
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/util.h"
@@ -179,27 +180,18 @@ absl::optional<ncclDataType_t> DatatypeToNccl(PrimitiveType element_type) {
   }
 }

-// Key for looking up a particular NCCL clique. This is just a set of unique
-// device ordinals (i.e. GPU IDs).
-struct NcclCliqueKey {
-  explicit NcclCliqueKey(absl::Span<const int64> devices)
-      : devices(devices.begin(), devices.end()) {
-    absl::c_sort(this->devices);
-    CHECK(absl::c_adjacent_find(devices) == devices.end())
-        << "Duplicate devices are not allowed: "
-        << absl::StrJoin(devices, ", ");
+Status StringToNcclUniqueId(const std::string& str_id, ncclUniqueId* nccl_id) {
+  if (str_id.size() != NCCL_UNIQUE_ID_BYTES) {
+    return InvalidArgument(
+        "ncclUniqueId string must have %d bytes, got %d bytes",
+        NCCL_UNIQUE_ID_BYTES, str_id.size());
   }
-
-  template <typename H>
-  friend H AbslHashValue(H h, const NcclCliqueKey& k) {
-    return H::combine(std::move(h), k.devices);
-  }
-  friend bool operator==(const NcclCliqueKey& a, const NcclCliqueKey& b) {
-    return a.devices == b.devices;
-  }
-
-  std::vector<int64> devices;
-};
+  // NcclUniqueId is internally just a char[].
+  static_assert(sizeof(ncclUniqueId) == NCCL_UNIQUE_ID_BYTES,
+                "NCCL_UNIQUE_ID_BYTES");
+  std::memcpy(static_cast<void*>(nccl_id), str_id.data(), NCCL_UNIQUE_ID_BYTES);
+  return Status::OK();
+}

 // Owns a clique of NCCL comms which can be used for collective operations among
 // a particular set of GPUs.
@@ -216,20 +208,29 @@ struct NcclCliqueKey {
 // GPUs, you'll need a different clique.
 class NcclClique {
  public:
-  explicit NcclClique(absl::Span<const int64> devices)
-      : devices_(devices.begin(), devices.end()) {
-    absl::c_sort(devices_);
-    status_ = Init();
+  explicit NcclClique(
+      int64 num_global_devices, std::vector<int64> local_device_ordinals,
+      std::vector<int64> local_device_ranks,
+      const StatusOr<absl::optional<std::string>>& nccl_unique_id)
+      : num_global_devices_(num_global_devices),
+        local_device_ordinals_(std::move(local_device_ordinals)),
+        local_device_ranks_(std::move(local_device_ranks)) {
+    CHECK_EQ(local_device_ordinals_.size(), local_device_ranks_.size());
+    // It's unusual to pass a StatusOr<> into a class, but since this class
+    // already has an erroneous state, it turns out to be a little easier to
+    // implement this way than to change RefcountingHashMap.
+    status_ = Init(nccl_unique_id);
   }

   Status status() { return status_; }

-  absl::Span<const int64> devices() {
-    TF_CHECK_OK(status_);
-    return devices_;
-  }
-  ncclComm_t comm(int64 device) {
-    int64 idx = std::distance(devices_.begin(), absl::c_find(devices_, device));
+  // A NCCL communicator is the NCCL state associated with a participant (rank)
+  // in a reduction. This method returns the state associated with a particular
+  // local device ordinal.
+  ncclComm_t comm(int64 device_ordinal) {
+    int64 idx =
+        std::distance(local_device_ordinals_.begin(),
+                      absl::c_find(local_device_ordinals_, device_ordinal));
     return comms_.at(idx).comm();
   }

@@ -249,10 +250,12 @@ class NcclClique {
   }

  private:
-  Status Init() {
+  Status Init(
+      const StatusOr<absl::optional<std::string>>& maybe_nccl_unique_id) {
     VLOG(3) << absl::StreamFormat(
-        "Initializing nccl comms for participant devices {%s}",
-        absl::StrJoin(devices_, ", "));
+        "Initializing nccl comms for participant device ordinals {%s}, "
+        "ranks {%s}",
+        absl::StrJoin(local_device_ordinals_, ", "),
+        absl::StrJoin(local_device_ranks_, ", "));

     // Restore CUDA device after running this. XLA shouldn't care, but maybe
     // another consumer does.
@@ -264,15 +267,23 @@ class NcclClique {
     // When using ncclGroupStart/End it seems that the ncclComm_t's are not
     // populated until the End() call. This unfortunately makes error handling
     // tricky.
-    std::vector<ncclComm_t> raw_comms(devices_.size(), nullptr);
+    std::vector<ncclComm_t> raw_comms(local_device_ordinals_.size(), nullptr);
+    TF_ASSIGN_OR_RETURN(const absl::optional<std::string>& nccl_id_string,
+                        maybe_nccl_unique_id);
+
     ncclUniqueId nccl_id;
-    XLA_CUDA_RETURN_IF_ERROR(ncclGetUniqueId(&nccl_id));
+    if (nccl_id_string) {
+      TF_RETURN_IF_ERROR(StringToNcclUniqueId(*nccl_id_string, &nccl_id));
+    } else {
+      XLA_CUDA_RETURN_IF_ERROR(ncclGetUniqueId(&nccl_id));
+    }
     XLA_CUDA_RETURN_IF_ERROR(ncclGroupStart());
     Status status = [&] {
-      for (int i = 0; i < devices_.size(); ++i) {
-        XLA_CUDA_RETURN_IF_ERROR(cudaSetDevice(devices_[i]));
-        XLA_CUDA_RETURN_IF_ERROR(
-            ncclCommInitRank(&raw_comms[i], devices_.size(), nccl_id, i));
+      for (int i = 0; i < local_device_ordinals_.size(); ++i) {
+        XLA_CUDA_RETURN_IF_ERROR(cudaSetDevice(local_device_ordinals_[i]));
+        XLA_CUDA_RETURN_IF_ERROR(ncclCommInitRank(&raw_comms[i],
+                                                  num_global_devices_, nccl_id,
+                                                  local_device_ranks_.at(i)));
       }
       return Status::OK();
     }();
@@ -282,9 +293,9 @@ class NcclClique {

     // Populate comms_ from the raw comms we created above. If we encountered
     // an error above we'll later clear comms_ thus destroying any raw comms
     // that were created before the error.
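Worth pausing on the Init loop above before the populate step continues: the loop still only visits this process's devices, but ncclCommInitRank now receives the global participant count and a global rank rather than the local ones, which is what makes a clique spanning several hosts possible. Each host runs the same loop with the same nccl_id:

// Host-local loop, globally-scoped communicator:
for (int i = 0; i < local_device_ordinals_.size(); ++i) {
  XLA_CUDA_RETURN_IF_ERROR(cudaSetDevice(local_device_ordinals_[i]));
  XLA_CUDA_RETURN_IF_ERROR(ncclCommInitRank(
      &raw_comms[i], num_global_devices_, nccl_id,  // global count and rank,
      local_device_ranks_.at(i)));                  // not devices_.size() and i
}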
- for (int i = 0; i < devices_.size(); ++i) { - VLOG(3) << absl::StreamFormat("Device %d assigned ncclComm %p", - devices_[i], raw_comms[i]); + for (int i = 0; i < local_device_ordinals_.size(); ++i) { + VLOG(3) << absl::StreamFormat("Device ordinal %d assigned ncclComm %p", + local_device_ordinals_[i], raw_comms[i]); CHECK(raw_comms[i] != nullptr || !status.ok()); comms_.emplace_back(raw_comms[i]); } @@ -296,7 +307,11 @@ class NcclClique { } Status status_; - std::vector devices_; + int64 num_global_devices_; + std::vector local_device_ordinals_; + // NCCL communicator rank for each local device. The rank of a device is equal + // to the offset of the local device in the global device set. + std::vector local_device_ranks_; std::vector comms_; // This mutex is in a unique_ptr so NcclClique can be movable. @@ -312,10 +327,7 @@ class NcclClique { // have one clique alive for a given set of GPUs. This means that a process // will never do two collective operations concurrently on the same set of GPUs. RefcountingHashMap& GlobalNcclCliqueMap() { - static auto& m = *new RefcountingHashMap( - [](const NcclCliqueKey& key) { - return absl::make_unique(key.devices); - }); + static auto& m = *new RefcountingHashMap(); return m; } @@ -341,10 +353,7 @@ class RendezvousNcclAllReduce : public Rendezvous> { RefcountingHashMap& GlobalRendezvousMap() { static auto& m = - *new RefcountingHashMap( - [](const RendezvousKey& k) { - return absl::make_unique(k); - }); + *new RefcountingHashMap(); return m; } @@ -365,12 +374,46 @@ RendezvousNcclAllReduce::SubmitParticipantImpl( // ensuring that there's a NCCL clique available for us to use. primary = !initialized_; + TF_RET_CHECK(participant.local_devices.size() == + participant.rendezvous_key.num_local_participants); + // Look up or create the NCCL clique for this set of devices. 
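Before the clique-factory code that follows, here is the rank rule stated earlier ("rank = offset of the device in the global device set") in worked form; the two-host layout is an assumed example, not taken from the patch:

// Hosts A and B each own two GPUs; global ids are {0,1} on A and {2,3} on B.
std::vector<xla::GlobalDeviceId> global_devices = {
    xla::GlobalDeviceId(0), xla::GlobalDeviceId(1),
    xla::GlobalDeviceId(2), xla::GlobalDeviceId(3)};
// On host B, local ordinal 0 carries global id 2; its NCCL rank is that id's
// position in global_devices:
auto it = absl::c_find(global_devices, xla::GlobalDeviceId(2));
int64_t rank = std::distance(global_devices.begin(), it);  // rank == 2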
- std::vector devices; - for (const auto& p : participants_) { - devices.push_back(p.device_ordinal); - } - clique = GlobalNcclCliqueMap()[NcclCliqueKey(devices)]; + NcclCliqueKey clique_key(participant.rendezvous_key.global_devices); + + auto clique_factory = + [&](const NcclCliqueKey& key) -> std::unique_ptr { + std::vector local_device_ranks; + std::vector local_device_ordinals; + local_device_ranks.reserve(participant.local_devices.size()); + local_device_ordinals.reserve(participant.local_devices.size()); + for (const auto& l : participant.local_devices) { + auto it = + absl::c_find(participant.rendezvous_key.global_devices, l.first); + CHECK(it != participant.rendezvous_key.global_devices.end()) << l.first; + local_device_ranks.push_back(std::distance( + participant.rendezvous_key.global_devices.begin(), it)); + local_device_ordinals.push_back(l.second); + } + StatusOr> nccl_unique_id; + if (participant.nccl_unique_id_callback) { + nccl_unique_id = (*participant.nccl_unique_id_callback)(clique_key); + } else { + if (participant.rendezvous_key.global_devices.size() != + participant.rendezvous_key.num_local_participants) { + nccl_unique_id = InvalidArgument( + "Multihost AllReduce on GPU requires a nccl_unique_id_callback " + "to be provided by the client."); + } else { + nccl_unique_id = absl::optional(); + } + } + return absl::make_unique( + participant.rendezvous_key.global_devices.size(), + std::move(local_device_ordinals), std::move(local_device_ranks), + nccl_unique_id); + }; + clique = + GlobalNcclCliqueMap().GetOrCreateIfAbsent(clique_key, clique_factory); if (primary) { VLOG(3) << "Primary initializing accounting data."; @@ -463,12 +506,12 @@ struct NcclAllReduceThunk::AuxData { crs->IsCrossReplicaAllReduce() && operands_are_supported(); } -/*static*/ absl::flat_hash_set +/*static*/ absl::flat_hash_set NcclAllReduceThunk::DevicesWithOpenNcclChannels() { - absl::flat_hash_set devices; + absl::flat_hash_set devices; GlobalNcclCliqueMap().ForEach( [&](const NcclCliqueKey& k, const std::shared_ptr&) { - devices.insert(k.devices.begin(), k.devices.end()); + devices.insert(k.devices().begin(), k.devices().end()); }); return devices; } @@ -491,23 +534,65 @@ Status NcclAllReduceThunk::ExecuteOnStream(const ExecuteParams& params) { params.profiler->MakeScopedInstructionProfiler(hlo_instruction()); auto* instr = Cast(hlo_instruction()); - int64 device_ordinal = params.stream->parent()->device_ordinal(); + int64 local_device_ordinal = params.stream->parent()->device_ordinal(); + GlobalDeviceId global_device_id; + if (params.gpu_global_device_ids) { + TF_RET_CHECK(0 <= local_device_ordinal && + local_device_ordinal < params.gpu_global_device_ids->size()); + global_device_id = (*params.gpu_global_device_ids)[local_device_ordinal]; + } else { + // No local -> global mapping was provided; assume the identity mapping. + global_device_id = GlobalDeviceId(local_device_ordinal); + } + // Determines the set of global and local devices that are participating in + // the same collective group as the caller. 
TF_ASSIGN_OR_RETURN( - std::vector participating_replicas, - GetParticipatingReplicas(device_ordinal, instr->replica_groups(), + std::vector global_participating_replicas, + GetParticipatingReplicas(global_device_id, instr->replica_groups(), replica_count_, *params.device_assn)); + std::vector global_devices; + std::vector> local_devices; + local_devices.reserve(global_participating_replicas.size()); + global_devices.reserve(global_participating_replicas.size()); + TF_RET_CHECK(params.device_assn->computation_count() == 1) + << params.device_assn->ToString(); + for (int64 replica : global_participating_replicas) { + GlobalDeviceId global_device( + (*params.device_assn)(replica, /*computation=*/0)); + global_devices.push_back(global_device); + if (!params.gpu_global_device_ids) { + local_devices.emplace_back(global_device, global_device.value()); + } else { + auto it = absl::c_find(*params.gpu_global_device_ids, global_device); + if (it != params.gpu_global_device_ids->end()) { + local_devices.emplace_back( + *it, std::distance(params.gpu_global_device_ids->begin(), it)); + } + } + } + absl::c_sort(global_devices); // Find or create the rendezvous for this collective operation. RendezvousKey rendezvous_key = RendezvousKey::FromInstruction( - params.run_id, participating_replicas, hlo_instruction()); - - VLOG(2) << "Rendezvous key: " << rendezvous_key.ToString() - << ", participating replicas: " - << absl::StrJoin(participating_replicas, ", "); + params.run_id, global_devices, local_devices.size(), hlo_instruction()); + if (VLOG_IS_ON(2)) { + std::vector local_participants; + for (const auto& entry : local_devices) { + local_participants.push_back(absl::StrFormat( + "global=%d/local=%d", entry.first.value(), entry.second)); + } + VLOG(2) << "Rendezvous key: " << rendezvous_key.ToString() + << ", global participating replicas: " + << absl::StrJoin(global_participating_replicas, ", ") + << ", global participating devices: " + << GlobalDeviceIdsToString(global_devices) + << ", local participants: " + << absl::StrJoin(local_participants, ","); + } AllReduceParticipantData participant(rendezvous_key); - participant.device_ordinal = device_ordinal; + participant.device_ordinal = local_device_ordinal; for (size_t i = 0; i < buffers_.size(); ++i) { const NcclAllReduceThunk::Buffer& buffer = buffers_[i]; AllReduceParticipantData::Buffer pbuffer; @@ -521,15 +606,24 @@ Status NcclAllReduceThunk::ExecuteOnStream(const ExecuteParams& params) { participant.buffers.push_back(pbuffer); } participant.stream = params.stream; + participant.local_devices = std::move(local_devices); + participant.nccl_unique_id_callback = params.nccl_unique_id_callback; auto reduction_kind = MatchReductionComputation(hlo_instruction()->to_apply()); CHECK(reduction_kind.has_value()); participant.reduction_kind = *reduction_kind; - TF_ASSIGN_OR_RETURN( - std::shared_ptr clique, - RendezvousNcclAllReduce::SubmitParticipant( - [&] { return GlobalRendezvousMap()[rendezvous_key]; }, participant)); + auto rendezvous_factory = [](const RendezvousKey& k) { + return absl::make_unique(k); + }; + + TF_ASSIGN_OR_RETURN(std::shared_ptr clique, + RendezvousNcclAllReduce::SubmitParticipant( + [&] { + return GlobalRendezvousMap().GetOrCreateIfAbsent( + rendezvous_key, rendezvous_factory); + }, + participant)); // Keep the clique we used alive for as long as this Thunk lives. Creating // new NCCL cliques is expensive, and this is how we avoid thrashing them. 
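One refactoring runs through every rendezvous and clique map in this patch and is easy to miss: RefcountingHashMap no longer takes a factory at construction; callers now pass one to GetOrCreateIfAbsent. The motivation is visible above, since the clique factory must capture per-participant state, which a single global factory could not. In miniature:

// Before: one global factory baked into the map.
//   static auto& m = *new RefcountingHashMap<RendezvousKey, RendezvousT>(
//       [](const RendezvousKey& k) { return absl::make_unique<RendezvousT>(k); });
//   auto r = m[key];
// After: the factory travels with each lookup and may capture locals.
auto factory = [&](const RendezvousKey& k) {
  return absl::make_unique<RendezvousNcclAllReduce>(k);
};
auto rendezvous = GlobalRendezvousMap().GetOrCreateIfAbsent(key, factory);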
diff --git a/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h b/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h index 7633a99794f..90091ed2c7b 100644 --- a/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h @@ -19,6 +19,7 @@ limitations under the License. #include "absl/container/flat_hash_set.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h" #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/service/gpu/thunk.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -46,7 +47,7 @@ class NcclAllReduceThunk : public Thunk { // (Indeed, because the NCCL channels are a global variable, in the real // world, the value returned here is stale as soon as you read it, so it's not // clear how you *could* use it for anything other than tests.) - static absl::flat_hash_set DevicesWithOpenNcclChannels(); + static absl::flat_hash_set DevicesWithOpenNcclChannels(); // TODO(b/125951860): Support all-reduces with replica groups, i.e. // all-reduces that compute multiple sums across subsets of all replicas. diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h b/tensorflow/compiler/xla/service/gpu/thunk.h index abf829cee00..326c5a20716 100644 --- a/tensorflow/compiler/xla/service/gpu/thunk.h +++ b/tensorflow/compiler/xla/service/gpu/thunk.h @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/compiler/xla/executable_run_options.h" #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h" #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/core/lib/core/status.h" @@ -98,6 +99,8 @@ class Thunk { HloExecutionProfiler* profiler; // never null const DeviceAssignment* device_assn; // never null std::vector>* deferred_host_callbacks; // never null + const std::vector* gpu_global_device_ids; // may be null + const NcclUniqueIdCallback* nccl_unique_id_callback; // may be null }; // Execute the kernel for the thunk on the given stream. This method must be diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto index c9afd6951e6..9ad07df8e9a 100644 --- a/tensorflow/compiler/xla/service/hlo.proto +++ b/tensorflow/compiler/xla/service/hlo.proto @@ -35,7 +35,7 @@ import "tensorflow/compiler/xla/xla_data.proto"; option cc_enable_arenas = true; // Serialization of HloInstruction. -// Next ID: 71 +// Next ID: 72 message HloInstructionProto { reserved 10; reserved "parameter_name"; @@ -179,6 +179,10 @@ message HloInstructionProto { // Non-positive all_reduce_id is equivalent to no all_reduce_id. int64 all_reduce_id = 45 [deprecated = true]; + // If true, interprets ids in ReplicaGroup as global device ids, which is + // a linearized id of `replica_id * partition_count + partition_id`. + bool use_global_device_ids = 71; + // Whether this Send/Recv instruction transfers data to/from the host. Only // present for Send and Recv instructions and their SendDone and RecvDone // partners. 
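The linearized-id convention behind the new proto field is easiest to see tabulated; with 2 replicas and 4 partitions (the example reused by hlo_instructions.h below):

// global_id = replica_id * partition_count + partition_id
//   (r0,p0)=0  (r0,p1)=1  (r0,p2)=2  (r0,p3)=3
//   (r1,p0)=4  (r1,p1)=5  (r1,p2)=6  (r1,p3)=7
// so replica_groups={{0,1,4,5},{2,3,6,7}} puts partitions {0,1} of both
// replicas in one group and partitions {2,3} of both replicas in the other.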
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index 8aeb92b40de..9698735b509 100755 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -407,7 +407,8 @@ StatusOr> HloInstruction::CreateFromProto( std::vector(proto.replica_groups().begin(), proto.replica_groups().end()), /*constrain_layout=*/proto.constrain_layout(), - /*channel_id=*/channel_id); + /*channel_id=*/channel_id, + /*use_global_device_ids=*/proto.use_global_device_ids()); break; } case HloOpcode::kAllToAll: { @@ -930,10 +931,10 @@ HloInstruction::CreateReducePrecision(const Shape& shape, const Shape& shape, absl::Span operands, HloComputation* reduce_computation, const std::vector& replica_groups, bool constrain_layout, - const absl::optional& channel_id) { + const absl::optional& channel_id, bool use_global_device_ids) { return absl::make_unique( shape, operands, reduce_computation, replica_groups, constrain_layout, - channel_id); + channel_id, use_global_device_ids); } /* static */ std::unique_ptr HloInstruction::CreateAllToAll( diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index 33c0daca686..22220ccc2d5 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -635,7 +635,7 @@ class HloInstruction { const Shape& shape, absl::Span operands, HloComputation* reduce_computation, const std::vector& replica_groups, bool constrain_layout, - const absl::optional& channel_id); + const absl::optional& channel_id, bool use_global_device_ids); // An all-to-all op takes N array operands of the same shape and scatters them // to N replicas. Each replica gathers the results into a tuple. 
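Every CreateAllReduce call site in this patch gains the trailing flag; a minimal call under the new signature, matching the test updates above (template arguments restored, names illustrative):

auto all_reduce = HloInstruction::CreateAllReduce(
    shape, {operand}, sum_computation, /*replica_groups=*/{},
    /*constrain_layout=*/false, /*channel_id=*/absl::nullopt,
    /*use_global_device_ids=*/false);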
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc index 1a30062a574..5905ecbdfb0 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.cc +++ b/tensorflow/compiler/xla/service/hlo_instructions.cc @@ -555,10 +555,11 @@ HloAllReduceInstruction::HloAllReduceInstruction( const Shape& shape, absl::Span operands, HloComputation* reduce_computation, const std::vector& replica_groups, bool constrain_layout, - const absl::optional& channel_id) + const absl::optional& channel_id, bool use_global_device_ids) : HloCollectiveInstruction(HloOpcode::kAllReduce, shape, operands, replica_groups, channel_id), - constrain_layout_(constrain_layout) { + constrain_layout_(constrain_layout), + use_global_device_ids_(use_global_device_ids) { AppendComputation(reduce_computation); } @@ -574,6 +575,7 @@ bool HloAllReduceInstruction::IsNoop() const { HloInstructionProto HloAllReduceInstruction::ToProto() const { HloInstructionProto proto = HloCollectiveInstruction::ToProto(); proto.set_constrain_layout(constrain_layout_); + proto.set_use_global_device_ids(use_global_device_ids_); return proto; } @@ -584,6 +586,9 @@ std::vector HloAllReduceInstruction::ExtraAttributesToStringImpl( if (constrain_layout_) { result.push_back("constrain_layout=true"); } + if (use_global_device_ids_) { + result.push_back("use_global_device_ids=true"); + } return result; } @@ -594,6 +599,7 @@ bool HloAllReduceInstruction::IdenticalSlowPath( const auto& casted_other = static_cast(other); return HloCollectiveInstruction::IdenticalSlowPath(other, eq_computations) && constrain_layout() == casted_other.constrain_layout() && + use_global_device_ids() == casted_other.use_global_device_ids() && eq_computations(to_apply(), casted_other.to_apply()); } @@ -603,7 +609,7 @@ HloAllReduceInstruction::CloneWithNewOperandsImpl( HloCloneContext* /*context*/) const { return absl::make_unique( shape, new_operands, to_apply(), replica_groups(), constrain_layout(), - channel_id()); + channel_id(), use_global_device_ids()); } HloAllToAllInstruction::HloAllToAllInstruction( diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h index f23453bc0be..8770e9be342 100755 --- a/tensorflow/compiler/xla/service/hlo_instructions.h +++ b/tensorflow/compiler/xla/service/hlo_instructions.h @@ -338,7 +338,7 @@ class HloAllReduceInstruction : public HloCollectiveInstruction { const Shape& shape, absl::Span operands, HloComputation* reduce_computation, const std::vector& replica_groups, bool constrain_layout, - const absl::optional& channel_id); + const absl::optional& channel_id, bool use_global_device_ids); // Returns true if the AllReduce does no communication, so it's equivalent // to a mem copy. @@ -359,6 +359,18 @@ class HloAllReduceInstruction : public HloCollectiveInstruction { // unconstrained AllReduce instructions (checked by HloVerifier). bool constrain_layout() const { return constrain_layout_; } + // Returns true if the ids in the ReplicaGroup config represent a global id of + // (replica_id * partition_count + partition_id) instead of a replica id. + // This enables more flexible grouping of devices if this all-reduce is both + // cross-partition and cross-replica. 
+ // + // For example with 2 replicas and 4 partitions, + // replica_groups={{0,1,4,5},{2,3,6,7}}, use_global_device_ids=true means that + // group[0] = (0,0), (0,1), (1,0), (1,1) + // group[1] = (0,2), (0,3), (1,2), (1,3) + // where each pair is (replica_id, partition_id). + bool use_global_device_ids() const { return use_global_device_ids_; } + protected: std::vector ExtraAttributesToStringImpl( const HloPrintOptions& options) const override; @@ -376,6 +388,7 @@ class HloAllReduceInstruction : public HloCollectiveInstruction { HloCloneContext* context) const override; bool constrain_layout_; + bool use_global_device_ids_; }; class HloAllToAllInstruction : public HloCollectiveInstruction { diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc index 95a18c8daa7..f41ed233ed3 100644 --- a/tensorflow/compiler/xla/service/hlo_parser.cc +++ b/tensorflow/compiler/xla/service/hlo_parser.cc @@ -855,6 +855,7 @@ bool HloParserImpl::ParseInstructionRhs(HloComputation::Builder* builder, optional> replica_group_ids; optional channel_id; optional constrain_layout; + optional use_global_device_ids; attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation, &to_apply}; attrs["replica_groups"] = {/*required=*/false, @@ -862,6 +863,8 @@ bool HloParserImpl::ParseInstructionRhs(HloComputation::Builder* builder, attrs["channel_id"] = {/*required=*/false, AttrTy::kInt64, &channel_id}; attrs["constrain_layout"] = {/*required=*/false, AttrTy::kBool, &constrain_layout}; + attrs["use_global_device_ids"] = {/*required=*/false, AttrTy::kBool, + &use_global_device_ids}; if (!ParseOperands(&operands) || !ParseAttributes(attrs)) { return false; } @@ -871,7 +874,8 @@ bool HloParserImpl::ParseInstructionRhs(HloComputation::Builder* builder, } instruction = builder->AddInstruction(HloInstruction::CreateAllReduce( shape, operands, *to_apply, replica_groups, - constrain_layout ? *constrain_layout : false, channel_id)); + constrain_layout ? *constrain_layout : false, channel_id, + use_global_device_ids ? *use_global_device_ids : false)); break; } case HloOpcode::kAllToAll: { diff --git a/tensorflow/compiler/xla/service/interpreter/platform.cc b/tensorflow/compiler/xla/service/interpreter/platform.cc index e1ab7bb9646..79f5c0fd901 100644 --- a/tensorflow/compiler/xla/service/interpreter/platform.cc +++ b/tensorflow/compiler/xla/service/interpreter/platform.cc @@ -30,7 +30,7 @@ limitations under the License. 
namespace stream_executor { namespace interpreter { -XlaInterpreterPlatform::XlaInterpreterPlatform(const string& name, +XlaInterpreterPlatform::XlaInterpreterPlatform(const std::string& name, const Platform::Id& id) : name_(name), id_(id) {} @@ -40,7 +40,7 @@ Platform::Id XlaInterpreterPlatform::id() const { return id_; } int XlaInterpreterPlatform::VisibleDeviceCount() const { return 1; } -const string& XlaInterpreterPlatform::Name() const { return name_; } +const std::string& XlaInterpreterPlatform::Name() const { return name_; } port::StatusOr> XlaInterpreterPlatform::DescriptionForDevice(int ordinal) const { diff --git a/tensorflow/compiler/xla/service/interpreter/platform.h b/tensorflow/compiler/xla/service/interpreter/platform.h index ff9c5d07f8d..da037bf17bc 100644 --- a/tensorflow/compiler/xla/service/interpreter/platform.h +++ b/tensorflow/compiler/xla/service/interpreter/platform.h @@ -31,14 +31,14 @@ class XlaInterpreterPlatform : public Platform { public: XlaInterpreterPlatform() : XlaInterpreterPlatform("Interpreter", kXlaInterpreterPlatformId) {} - XlaInterpreterPlatform(const string& name, const Platform::Id& id); + XlaInterpreterPlatform(const std::string& name, const Platform::Id& id); ~XlaInterpreterPlatform() override; Platform::Id id() const override; int VisibleDeviceCount() const override; - const string& Name() const override; + const std::string& Name() const override; port::StatusOr> DescriptionForDevice( int ordinal) const override; @@ -60,7 +60,7 @@ class XlaInterpreterPlatform : public Platform { private: // This platform's name. - string name_; + std::string name_; // This platform's id. Platform::Id id_; diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc index 91a00b5555a..ef8ddfc1a76 100644 --- a/tensorflow/compiler/xla/service/local_service.cc +++ b/tensorflow/compiler/xla/service/local_service.cc @@ -112,6 +112,10 @@ ExecutionOptions CreateExecutionOptions( } execution_options.set_num_replicas(build_options.num_replicas()); execution_options.set_num_partitions(build_options.num_partitions()); + if (build_options.has_device_assignment()) { + TF_CHECK_OK(build_options.device_assignment().Serialize( + execution_options.mutable_device_assignment())); + } execution_options.set_alias_passthrough_params( build_options.alias_passthrough_params()); return execution_options; diff --git a/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter.cc b/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter.cc index aa28a36c945..b3e4002a898 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter.cc @@ -31,7 +31,7 @@ limitations under the License. 
#include "llvm/ADT/STLExtras.h" #include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/AffineOps/AffineOps.h" // TF:llvm-project -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/AffineExpr.h" // TF:llvm-project #include "mlir/IR/AffineMap.h" // TF:llvm-project #include "mlir/IR/StandardTypes.h" // TF:llvm-project diff --git a/tensorflow/compiler/xla/service/mlir_gpu/hlo_dialect_emitter.cc b/tensorflow/compiler/xla/service/mlir_gpu/hlo_dialect_emitter.cc index bb67305c344..184d8d202c3 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/hlo_dialect_emitter.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/hlo_dialect_emitter.cc @@ -16,7 +16,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/mlir_gpu/hlo_dialect_emitter.h" #include "llvm/ADT/STLExtras.h" -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/StandardTypes.h" // TF:llvm-project #include "mlir/IR/Types.h" // TF:llvm-project diff --git a/tensorflow/compiler/xla/service/mlir_gpu/inject_errors_pass.cc b/tensorflow/compiler/xla/service/mlir_gpu/inject_errors_pass.cc index 5d67d7dcf7f..bd64c18680c 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/inject_errors_pass.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/inject_errors_pass.cc @@ -15,7 +15,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/mlir_gpu/inject_errors_pass.h" -#include "mlir/Dialect/StandardOps/Ops.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" namespace mlir { namespace { diff --git a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc index 79e56e4a96f..ca26ae4e756 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc @@ -31,7 +31,7 @@ limitations under the License. #include "mlir/Dialect/Linalg/IR/LinalgOps.h" // TF:llvm-project #include "mlir/Dialect/Linalg/Passes.h" // TF:llvm-project #include "mlir/Dialect/LoopOps/LoopOps.h" // TF:llvm-project -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/BlockAndValueMapping.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project @@ -327,7 +327,7 @@ class LowerToNVVMPass ::mlir::gpu::GPUModuleOp m = getOperation(); ::mlir::OwningRewritePatternList patterns; - ::mlir::LinalgTypeConverter converter(m.getContext()); + ::mlir::LLVMTypeConverter converter(m.getContext()); ::mlir::populateStdToLLVMConversionPatterns(converter, patterns); // TODO(b/145824979) Remove linalg once sliceop is in std. ::mlir::populateLinalgToLLVMConversionPatterns(converter, patterns, diff --git a/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc b/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc index 13009992ab5..75c7c284881 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc @@ -16,7 +16,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" // TF:llvm-project -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project #include "mlir/IR/Function.h" // TF:llvm-project diff --git a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.cc b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.cc index e383169399c..e471ba192e1 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.cc @@ -21,7 +21,7 @@ limitations under the License. #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" // TF:llvm-project #include "mlir/Dialect/GPU/GPUDialect.h" // TF:llvm-project #include "mlir/Dialect/LLVMIR/LLVMDialect.h" // TF:llvm-project -#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Function.h" // TF:llvm-project #include "mlir/IR/Location.h" // TF:llvm-project diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h index 3a4e17d7f44..d58020655de 100644 --- a/tensorflow/compiler/xla/service/service.h +++ b/tensorflow/compiler/xla/service/service.h @@ -184,7 +184,7 @@ class Service : public ServiceInterface { Backend* mutable_backend() { return execute_backend_.get(); } // Create a Hlo module config for the given program shape and arguments. - // execution_options is optional; if not given a default is used. + // aot_options is optional; if not given a default is used. StatusOr> CreateModuleConfig( const ProgramShape& program_shape, absl::Span argument_shapes, diff --git a/tensorflow/compiler/xla/tests/collective_ops_test.cc b/tensorflow/compiler/xla/tests/collective_ops_test.cc index 5cdf9633ca4..464865506f7 100644 --- a/tensorflow/compiler/xla/tests/collective_ops_test.cc +++ b/tensorflow/compiler/xla/tests/collective_ops_test.cc @@ -159,7 +159,7 @@ DeviceAssignment MakeDeviceAssn(std::vector devices) { } // Shorter alias for this function. -absl::flat_hash_set OpenNcclChannels() { +absl::flat_hash_set OpenNcclChannels() { return gpu::NcclAllReduceThunk::DevicesWithOpenNcclChannels(); } diff --git a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc index 8f89c4655a3..c35f05ebf45 100644 --- a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc +++ b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc @@ -60,8 +60,8 @@ int main(int argc, char** argv) { LOG(FATAL) << "local_client_aot_test_helper TARGET_CPU"; } - string triple_string; - string target_cpu = argv[1]; + std::string triple_string; + std::string target_cpu = argv[1]; if (target_cpu == "k8") { triple_string = "x86_64-none-linux-gnu"; } else if (target_cpu == "darwin") { diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 5002f80c059..4bae89cda01 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -1841,6 +1841,32 @@ cc_library( ] + if_static([":lib_internal_impl"]), ) +# Until we can ditch config=monolithic on windows, we have to provide an always +# headers only library for pybind rules to depend on. 
+cc_library(
+    name = "lib_headers_for_pybind",
+    srcs = [":lib_internal_private_headers"],
+    hdrs = [":lib_internal_public_headers"],
+    copts = tf_copts(),
+    linkopts = select({
+        "//tensorflow:freebsd": [],
+        "//tensorflow:windows": [],
+        "//tensorflow:android": [],
+        "//conditions:default": [
+            "-ldl",
+            "-lpthread",
+        ],
+    }),
+    visibility = ["//tensorflow/python:__pkg__"],
+    deps = tf_additional_lib_deps() + [
+        "@com_google_absl//absl/meta:type_traits",
+        "@com_google_absl//absl/strings",
+        "//third_party/eigen3",
+        "@com_google_absl//absl/base:core_headers",
+        "//tensorflow/core/platform/default/build_config:platformlib",
+    ],
+)
+
 cc_library(
     name = "lib_internal_impl",
     srcs = [
diff --git a/tensorflow/core/api_def/python_api/api_def_Rsqrt.pbtxt b/tensorflow/core/api_def/python_api/api_def_Rsqrt.pbtxt
index 6b9beaebf84..1e9feef24a7 100644
--- a/tensorflow/core/api_def/python_api/api_def_Rsqrt.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Rsqrt.pbtxt
@@ -1,20 +1,4 @@
 op {
   graph_op_name: "Rsqrt"
-  endpoint {
-    name: "math.rsqrt"
-  }
-  endpoint {
-    name: "rsqrt"
-    deprecation_version: 2
-  }
-  description: <<END
-
-For example:
-
->>> x = tf.constant([2., 0., -2.])
->>> tf.math.rsqrt(x)
-<tf.Tensor: shape=(3,), dtype=float32,
-numpy=array([0.70710677, inf, nan], dtype=float32)>
-
-END
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/common_runtime/device_mgr.cc b/tensorflow/core/common_runtime/device_mgr.cc
index b17278fb365..c7583c374f2 100644
--- a/tensorflow/core/common_runtime/device_mgr.cc
+++ b/tensorflow/core/common_runtime/device_mgr.cc
@@ -45,7 +45,7 @@ StaticDeviceMgr::StaticDeviceMgr(std::vector<std::unique_ptr<Device>> devices)
     }
     const auto& t = d->device_type();
     device_type_counts_[t]++;
-    if (cpu_device_ == nullptr && t == "CPU") {
+    if (cpu_device_ == nullptr && t == "CPU" && d->parsed_name().id == 0) {
       cpu_device_ = d.get();
     }
   }
diff --git a/tensorflow/core/common_runtime/dynamic_device_mgr.cc b/tensorflow/core/common_runtime/dynamic_device_mgr.cc
index f7e2e27e4ab..4bea08bb021 100644
--- a/tensorflow/core/common_runtime/dynamic_device_mgr.cc
+++ b/tensorflow/core/common_runtime/dynamic_device_mgr.cc
@@ -194,7 +194,8 @@ Device* DynamicDeviceMgr::HostCPU() const {
   }
   cpu_device_ = nullptr;
   for (const auto& pair : dynamic_devices_) {
-    if (pair.first->device_type() == DEVICE_CPU) {
+    if (pair.first->device_type() == DEVICE_CPU &&
+        pair.first->parsed_name().id == 0) {
       cpu_device_ = pair.first;
       break;
     }
diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc
index 7c7f1b3f498..5b2035edf43 100644
--- a/tensorflow/core/common_runtime/eager/context.cc
+++ b/tensorflow/core/common_runtime/eager/context.cc
@@ -167,6 +167,16 @@ std::vector<string> DevicesToString(const PrioritizedDeviceVector& devices) {
   return v;
 }

+std::vector<string> DeviceTypesToString(
+    const PrioritizedDeviceTypeVector& types) {
+  std::vector<string> v;
+  v.reserve(types.size());
+  for (const auto& p : types) {
+    v.push_back(p.first.type_string());
+  }
+  return v;
+}
+
 // Selects the "best" device that both exists and is supported.
 //
 // The `existing` argument specifies the available devices in the system, in
@@ -231,12 +241,18 @@ Status EagerContext::SelectDevice(DeviceNameUtils::ParsedName preferred,
   if (DeviceNameUtils::HasSomeDetails(preferred)) {
     return errors::InvalidArgument(
         "Could not satisfy device specification '", preferred,
-        "'. All available devices [",
+        "'. enable_soft_placement=", AllowSoftPlacement(),
+        ". Supported device types [",
+        absl::StrJoin(DeviceTypesToString(supported), ", "),
+        "]. All available devices [",
All available devices [", absl::StrJoin(DevicesToString(existing), ", "), "]."); } return errors::InvalidArgument( "No supported device found in available devices [", - absl::StrJoin(DevicesToString(existing), ", "), "]."); + absl::StrJoin(DevicesToString(existing), ", "), + "]. enable_soft_placement=", AllowSoftPlacement(), + ". Supported devices types [", + absl::StrJoin(DeviceTypesToString(supported), ", "), "]."); } void EagerContext::ResetClusterFLR( @@ -654,34 +670,51 @@ Status EagerContext::RemoveFunction(const string& func) { return Status::OK(); } -Status EagerContext::ClearRemoteExecutors() { +Status EagerContext::SyncExecutors() { + StatusGroup sg; + // Synchronize on context default executor + sg.Update(default_executor_.WaitForAllPendingNodes()); + default_executor_.ClearError(); + + // Synchronize thread local executors on client + std::unordered_map executors_copy; + { + mutex_lock l(executor_map_mu_); + executors_copy = thread_local_executor_; + } + for (const auto& entry : executors_copy) { + sg.Update(entry.second->WaitForAllPendingNodes()); + entry.second->ClearError(); + } + #if !defined(IS_MOBILE_PLATFORM) + // Synchronize executors on remote workers eager::EnqueueRequest request; request.set_context_id(GetContextId()); - request.add_queue()->mutable_clear_remote_executor_for_stream(); + request.add_queue()->mutable_sync_remote_executor_for_stream(); BlockingCounter counter(static_cast(remote_contexts_.size())); + std::vector statuses(remote_contexts_.size()); - for (const auto& target : remote_contexts_) { + for (int i = 0; i < remote_contexts_.size(); i++) { + const auto& target = remote_contexts_[i]; core::RefCountPtr eager_client; TF_RETURN_IF_ERROR(remote_eager_workers_->GetClient(target, &eager_client)); eager::EnqueueResponse* response = new eager::EnqueueResponse(); eager_client->StreamingEnqueueAsync( - &request, response, [response, target, &counter](const Status& status) { - if (!status.ok()) { - LOG(ERROR) << "Cleared remote executor on " << target - << " with status: " << status.error_message(); - } + &request, response, + [response, target, &counter, &s = statuses[i]](const Status& status) { + s = status; delete response; counter.DecrementCount(); }); } - // Currently we have to block since it appears that ops sent before the clear - // message returns can be cancelled unexpectedly. - // TODO(haoyuzhang): Remove the block. counter.Wait(); + for (const Status& s : statuses) { + sg.Update(s); + } #endif // !IS_MOBILE_PLATFORM - return Status::OK(); + return sg.as_summary_status(); } core::RefCountPtr EagerContext::GetCachedKernel( @@ -835,12 +868,12 @@ Status EagerContext::GetClient(const string& remote_task, return Status::OK(); } -uint64 EagerContext::GetContextId() { +uint64 EagerContext::GetContextId() const { tf_shared_lock l(remote_state_mu_); return context_id_; } -uint64 EagerContext::GetContextViewId() { +uint64 EagerContext::GetContextViewId() const { tf_shared_lock l(remote_state_mu_); return context_view_id_; } diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h index 58a60f00393..b59b0653930 100644 --- a/tensorflow/core/common_runtime/eager/context.h +++ b/tensorflow/core/common_runtime/eager/context.h @@ -236,8 +236,12 @@ class EagerContext : public core::RefCounted { Status RemoveFunction(const string& func); - // Clear remote executors on all worker targets in `remote_contexts_`. 
- Status ClearRemoteExecutors(); + // Wait for pending nodes to be finished in local executors (including context + // default executor and thread executors) and executors on remote workers. + // Return combined status of remote executors. If there are multiple errors, + // the Status code will be the same as the first remote executor that has + // errors, and the error message will be combined from all executors. + Status SyncExecutors(); core::RefCountPtr GetCachedKernel(Fprint128 cache_key); @@ -317,8 +321,8 @@ class EagerContext : public core::RefCounted { Status GetClient(const string& remote_task, core::RefCountPtr* client); - uint64 GetContextId(); - uint64 GetContextViewId(); + uint64 GetContextId() const; + uint64 GetContextViewId() const; void IncrementContextViewId(); // TODO(nareshmodi): Encapsulate remote state into a separate @@ -601,7 +605,7 @@ class EagerContext : public core::RefCounted { std::shared_ptr worker_session_; std::unique_ptr remote_eager_workers_; - mutex remote_state_mu_; + mutable mutex remote_state_mu_; uint64 context_id_ GUARDED_BY(remote_state_mu_); // The view id of an eager context should be set to 0 when context is created, diff --git a/tensorflow/core/common_runtime/eager/eager_executor.cc b/tensorflow/core/common_runtime/eager/eager_executor.cc index b2979bd9dee..d49c9a5064b 100644 --- a/tensorflow/core/common_runtime/eager/eager_executor.cc +++ b/tensorflow/core/common_runtime/eager/eager_executor.cc @@ -19,8 +19,17 @@ limitations under the License. #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/util/env_var.h" namespace tensorflow { +namespace { +bool IsAsyncWaitForRemoteFunctionEnabled() { + bool enabled = true; + TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_ASYNC_WAIT_FOR_REMOTE_FUNCTION", + true, &enabled)); + return enabled; +} +} // namespace EagerExecutor::EagerExecutor(bool async) : next_node_id_(0), @@ -28,7 +37,10 @@ EagerExecutor::EagerExecutor(bool async) thread_(async ? tensorflow::Env::Default()->StartThread( tensorflow::ThreadOptions(), "eager_async_executor", std::bind(&EagerExecutor::Run, this)) - : nullptr) {} + : nullptr), + last_eager_client_(nullptr), + enable_async_wait_for_remote_function_( + IsAsyncWaitForRemoteFunctionEnabled()) {} EagerExecutor::~EagerExecutor() { tensorflow::mutex_lock l(node_queue_mutex_); @@ -194,6 +206,7 @@ void EagerExecutor::ClearError() { DCHECK(node_queue_.empty()); status_ = tensorflow::Status::OK(); ok_ = true; + last_eager_client_ = nullptr; nodes_pending_.notify_all(); } @@ -231,7 +244,7 @@ void EagerExecutor::NodeDone(const core::RefCountPtr& item, DCHECK_GT(result, 0); } - if (!status.ok()) { + if (!status.ok() && item->node->Fatal()) { // Since we received an error, broadcast to any waiters. need_notification = true; status_ = status; @@ -327,6 +340,33 @@ Status EagerExecutor::RunItem(core::RefCountPtr item, bool from_queue) { DVLOG(3) << "Running Node: [id " << item->id << "] " << item->node->DebugString(); + AsyncRemoteExecuteNode* async_remote_node = + item->node->AsAsyncRemoteExecuteNode(); + if (enable_async_wait_for_remote_function_) { + if (async_remote_node != nullptr) { + if (last_eager_client_ != nullptr && + async_remote_node->eager_client() != nullptr && + last_eager_client_ != async_remote_node->eager_client()) { + // Running a remote function, need to sync if the function is going to + // different device than last time we run remote distributed function. 
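The `enable_async_wait_for_remote_function_` flag consulted here is read once at executor construction. `ReadBoolFromEnvVar` is a real helper from tensorflow/core/util/env_var.h; a minimal usage sketch (only the wrapper function name below is mine):

```cpp
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/util/env_var.h"

// Unset or unparseable values fall back to the default (true here), so the
// sync-between-clients behavior stays on unless explicitly disabled, e.g.
//   TF_ENABLE_ASYNC_WAIT_FOR_REMOTE_FUNCTION=false python train.py
bool AsyncWaitForRemoteFunctionEnabled() {
  bool enabled = true;
  TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar(
      "TF_ENABLE_ASYNC_WAIT_FOR_REMOTE_FUNCTION", /*default_val=*/true,
      &enabled));
  return enabled;
}
```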
+ DVLOG(3) << "Executing Sync Executor for node" << item->id; + tensorflow::Status status = async_remote_node->SyncExecutors(); + if (!status.ok()) { + NodeDone(item, status, from_queue); + return status; + } + last_eager_client_ = nullptr; + } + if (async_remote_node->eager_client() != nullptr && + async_remote_node->needs_remote_inputs() && + async_remote_node->allow_multiple_pending_requests()) { + // We are running remote distributed function, update + // last_remote_device_name_. + last_eager_client_ = async_remote_node->eager_client(); + } + } + } + AsyncEagerNode* async_node = item->node->AsAsync(); if (async_node == nullptr) { tensorflow::Status status = item->node->Run(); diff --git a/tensorflow/core/common_runtime/eager/eager_executor.h b/tensorflow/core/common_runtime/eager/eager_executor.h index b9fd0122faf..375e9a6e6a7 100644 --- a/tensorflow/core/common_runtime/eager/eager_executor.h +++ b/tensorflow/core/common_runtime/eager/eager_executor.h @@ -39,6 +39,10 @@ limitations under the License. namespace tensorflow { class AsyncEagerNode; +class AsyncRemoteExecuteNode; +namespace eager { +class EagerClient; +} // A unit of execution for the EagerExecutor class below. Example subclasses // encapsulate execution of a TFE_Op, or copying a TFE_TensorHandle from one @@ -65,8 +69,12 @@ class EagerNode { // Returns nullptr iff this Eager node is synchronous. virtual AsyncEagerNode* AsAsync() { return nullptr; } + virtual AsyncRemoteExecuteNode* AsAsyncRemoteExecuteNode() { return nullptr; } virtual string DebugString() const = 0; + + // Indicates whether a node failure should make the executor unusable. + virtual bool Fatal() const { return true; } }; class AsyncEagerNode : public EagerNode { @@ -83,6 +91,16 @@ class AsyncEagerNode : public EagerNode { } }; +class AsyncRemoteExecuteNode : public AsyncEagerNode { + public: + AsyncRemoteExecuteNode* AsAsyncRemoteExecuteNode() final { return this; } + + virtual const eager::EagerClient* eager_client() const = 0; + virtual bool needs_remote_inputs() const = 0; + virtual bool allow_multiple_pending_requests() const = 0; + virtual Status SyncExecutors() = 0; +}; + // A class for handling async execution (see TFE_ContextSetAsync). // Note that this class is thread-safe. // TODO(agarwal): TFE_OpAddInput may currently block if it tries to access the @@ -225,6 +243,11 @@ class EagerExecutor { // Thread object that calls the `Run` method in async mode.This thread runs // until state_ is set to kShuttingDown. It is `nullptr` in sync mode. const std::unique_ptr thread_; + + // Last device where remote function with remote inputs was executed. + const eager::EagerClient* last_eager_client_; + + const bool enable_async_wait_for_remote_function_; }; inline bool EagerExecutor::Async() const { return thread_ != nullptr; } diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc index bc1bf9c1610..f913f5e4b86 100644 --- a/tensorflow/core/common_runtime/eager/execute.cc +++ b/tensorflow/core/common_runtime/eager/execute.cc @@ -121,6 +121,11 @@ Status CopyInputToExpectedDevice(EagerContext* ctx, EagerOperation* op, } TF_FALLTHROUGH_INTENDED; case DEVICE_PLACEMENT_EXPLICIT: + // tf.identity is allowed to copy, as indicated in the error message + // below. 
+ if (op->Name() == "Identity" || op->Name() == "IdentityN") { + break; + } return errors::InvalidArgument( "Tensors on conflicting devices:" " cannot compute ", @@ -716,7 +721,8 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals, } else { serialize_resource_dtype_and_shape = (input->dtype == DT_RESOURCE) && - (!input->HasResourceShapeMirror(op_device)); + (!input->HasResourceShapeMirror(op_device, + ctx.GetContextViewId())); } } auto* input_handle = remote_op->add_inputs(); @@ -727,7 +733,7 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals, auto tensor_handle_data = absl::make_unique( input_handle->op_id(), input_handle->output_num(), remote_task, - context_id, &ctx); + &ctx); TF_RETURN_IF_ERROR(input->AddResourceShapeMirror( std::move(tensor_handle_data), op_device)); } @@ -760,8 +766,7 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals, // to copy this tensor to this process, the remote end will know the // correct device of this handle. Status status = TensorHandle::CreateUnshapedRemoteHandle( - id, i, remote_task, context_id, output_dtypes[i], op_device, &ctx, - &retvals[i]); + id, i, remote_task, output_dtypes[i], op_device, &ctx, &retvals[i]); if (!status.ok()) { for (int j = 0; j < i; ++j) { retvals[j]->Poison(errors::Internal( @@ -791,7 +796,8 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals, << " (is async?: " << executor.Async() << ")."; std::unique_ptr node(new eager::RemoteExecuteNode( - std::move(request), op_device, eager_client.get(), + &op->EagerContext(), std::move(request), op_device, + ctx.GetContextViewId(), eager_client.get(), op->MutableAttrs()->BuildNodeDef(), op->EagerContext().FuncLibDef(), op->Inputs(), {retvals, num_outputs})); Status s = executor.AddOrExecute(std::move(node)); @@ -1052,7 +1058,10 @@ Status LocalEagerCopyToDevice(TensorHandle* h, EagerContext* ctx, return Status::OK(); } - if (mirror) { + // TODO(gjn): Need to add support for async execution. Note if receiver + // is local, we need to first add support in TensorHandle to wait on local + // mirrors. + if (mirror && !executor->Async()) { TF_RETURN_IF_ERROR(h->AddEmptyLocalMirror(d)); h->Ref(); *result = h; @@ -1086,7 +1095,7 @@ Status EagerCopyToDevice(TensorHandle* h, EagerContext* ctx, } bool sender_is_local = absl::get(send_device)->IsLocal(); - bool recver_is_local = device->IsLocal(); + bool receiver_is_local = device->IsLocal(); if (!executor->Async()) { // In sync mode, always clear error to maintain the same behavior as before. @@ -1094,27 +1103,42 @@ Status EagerCopyToDevice(TensorHandle* h, EagerContext* ctx, executor->ClearError(); } - if (sender_is_local && recver_is_local) { + if (sender_is_local && receiver_is_local) { return LocalEagerCopyToDevice(h, ctx, executor, device, mirror, result); } else { #if defined(IS_MOBILE_PLATFORM) return errors::Unimplemented( "Eager's remote execution is not available on mobile devices."); #else // !IS_MOBILE_PLATFORM - if (mirror) { - if (h->HasRemoteMirror(device)) { + uint64 recv_op_id = 0; + if (receiver_is_local) { + Device* d = ctx->CanonicalDevice(device); + if (mirror && h->HasLocalMirror(d)) { h->Ref(); *result = h; return Status::OK(); } - } - uint64 recv_op_id = 0; - if (recver_is_local) { - TF_RETURN_IF_ERROR(TensorHandle::CreateEmptyLocalHandle( - true, /* d= */ ctx->CanonicalDevice(device), /* op_device= */ device, - /*resource_device=*/nullptr, h->dtype, ctx, result)); + + // TODO(gjn): Need to add support for async execution. 
Note if receiver + // is local, we need to first add support in TensorHandle to wait on local + // mirrors. + if (mirror && !executor->Async()) { + TF_RETURN_IF_ERROR(h->AddEmptyLocalMirror(d)); + h->Ref(); + *result = h; + } else { + TF_RETURN_IF_ERROR(TensorHandle::CreateEmptyLocalHandle( + true, /* d= */ d, /* op_device= */ device, + /*resource_device=*/nullptr, h->dtype, ctx, result)); + } } else { - uint64 context_id = ctx->GetContextId(); + if (mirror) { + if (h->HasRemoteMirror(device, ctx->GetContextViewId())) { + h->Ref(); + *result = h; + return Status::OK(); + } + } string remote_task; if (!DeviceNameUtils::GetTaskName(device->parsed_name(), &remote_task)) { return errors::InvalidArgument( @@ -1123,8 +1147,8 @@ Status EagerCopyToDevice(TensorHandle* h, EagerContext* ctx, } recv_op_id = ctx->RemoteMgr()->NextOpId(); auto tensor_handle_data = - absl::make_unique( - recv_op_id, 0, remote_task, context_id, ctx); + absl::make_unique(recv_op_id, 0, + remote_task, ctx); if (mirror) { TF_RETURN_IF_ERROR( h->AddUnshapedRemoteMirror(std::move(tensor_handle_data), device)); @@ -1135,6 +1159,7 @@ Status EagerCopyToDevice(TensorHandle* h, EagerContext* ctx, std::move(tensor_handle_data), h->dtype, device, ctx, result)); } } + auto node = absl::make_unique( ctx, executor, h, result[0], device, recv_op_id); Status s = executor->AddOrExecute(std::move(node)); diff --git a/tensorflow/core/common_runtime/eager/execute_node.cc b/tensorflow/core/common_runtime/eager/execute_node.cc index c053420fe83..d523dc20084 100644 --- a/tensorflow/core/common_runtime/eager/execute_node.cc +++ b/tensorflow/core/common_runtime/eager/execute_node.cc @@ -27,28 +27,29 @@ Status ExecuteNodeArgs::Init( // be decremented once execution is complete. const int n_inputs = op_inputs.size(); if (n_inputs > 0) { - TensorHandle* const* op_inputs_array = &op_inputs[0]; - TensorValue* tensor_args_array = &tensor_args_[0]; + TensorHandle* const* op_inputs_flat = &op_inputs[0]; + TensorValue* tensor_args_flat = &tensor_args_[0]; for (int i = 0; i < n_inputs; ++i) { - TensorHandle* in = op_inputs_array[i]; - if (!in->IsRemote()) { - TF_RETURN_IF_ERROR( - in->TensorValue(&tensor_args_array[i], - ctx->CanonicalDevice(kernel->InputDevice(i)))); - } else { - if (!has_remote_inputs_) { - has_remote_inputs_ = true; + TensorHandle* in = op_inputs_flat[i]; + Device* d = kernel->InputDevice(i); + Status s = in->TensorValue(&tensor_args_flat[i], ctx->CanonicalDevice(d)); + if (!s.ok()) { +#if !defined(IS_MOBILE_PLATFORM) + uint64 context_view_id = ctx->GetContextViewId(); + if (in->IsRemote() || in->HasRemoteMirror(d, context_view_id)) { + if (!has_remote_inputs_) { + has_remote_inputs_ = true; + } + continue; } +#endif + return s; } } } +#if !defined(IS_MOBILE_PLATFORM) if (has_remote_inputs_) { -#if defined(IS_MOBILE_PLATFORM) - return errors::Unimplemented( - "Eager's function execution with remote inputs is not available on " - "mobile devices."); -#else // !IS_MOBILE_PLATFORM serialize_remote_handle_ = [ctx, &op_inputs](const int i, eager::RemoteTensorHandle* handle) -> Status { @@ -63,8 +64,8 @@ Status ExecuteNodeArgs::Init( return ctx->RemoteMgr()->SerializeRemoteTensorHandle( op_inputs[i], handle, device, device->name()); }; -#endif // !IS_MOBILE_PLATFORM } +#endif // !IS_MOBILE_PLATFORM return Status::OK(); } diff --git a/tensorflow/core/common_runtime/eager/mkl_eager_op_rewrite.cc b/tensorflow/core/common_runtime/eager/mkl_eager_op_rewrite.cc index a222724ec8f..346ccc11ca8 100644 --- 
a/tensorflow/core/common_runtime/eager/mkl_eager_op_rewrite.cc +++ b/tensorflow/core/common_runtime/eager/mkl_eager_op_rewrite.cc @@ -135,11 +135,13 @@ Status MklEagerOpRewrite::SetupNewOp( ->MutableAttrs() ->Set("_kernel", mkl_op_registry::kMklNameChangeOpLabel); - if (orig_op->Device() != nullptr) { - (*new_mkl_op)->SetDevice(orig_op->Device()); - } else { + if (orig_op->Device() == kVariantDeviceNull) { string device_name = orig_op->GetDeviceName(); (*new_mkl_op)->SetDeviceName(device_name.c_str()); + } else if (VariantDeviceIsCustom(orig_op->Device())) { + (*new_mkl_op)->SetDevice(absl::get(orig_op->Device())); + } else { + (*new_mkl_op)->SetDevice(absl::get(orig_op->Device())); } return Status::OK(); } diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.cc b/tensorflow/core/common_runtime/eager/tensor_handle.cc index ef2b3104ed8..4dbf5d6313d 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle.cc +++ b/tensorflow/core/common_runtime/eager/tensor_handle.cc @@ -239,13 +239,12 @@ Status TensorHandle::CreateRemoteHandle( Status TensorHandle::CreateRemoteHandle(int64 op_id, int output_num, const TensorShape& shape, const string& remote_task, - uint64 context_id, DataType dtype, - Device* d, Device* resource_device, + DataType dtype, Device* d, + Device* resource_device, EagerContext* ctx, TensorHandle** h) { - *h = new TensorHandle( - absl::make_unique(op_id, output_num, shape, - remote_task, context_id, ctx), - dtype, d, resource_device, ctx); + *h = new TensorHandle(absl::make_unique( + op_id, output_num, shape, remote_task, ctx), + dtype, d, resource_device, ctx); return Status::OK(); } @@ -276,11 +275,13 @@ Status TensorHandle::CreateUnshapedRemoteHandle( return Status::OK(); } -Status TensorHandle::CreateUnshapedRemoteHandle( - int64 op_id, int32 output_num, const string& remote_task, uint64 context_id, - DataType dtype, Device* device, EagerContext* ctx, TensorHandle** h) { +Status TensorHandle::CreateUnshapedRemoteHandle(int64 op_id, int32 output_num, + const string& remote_task, + DataType dtype, Device* device, + EagerContext* ctx, + TensorHandle** h) { *h = new TensorHandle(absl::make_unique( - op_id, output_num, remote_task, context_id, ctx), + op_id, output_num, remote_task, ctx), dtype, device, ctx); return Status::OK(); } @@ -293,8 +294,6 @@ TensorHandle::TensorHandle(std::unique_ptr t, resource_device_(dtype == DT_RESOURCE ? 
device : nullptr), remote_op_id_(t->op_id()), remote_output_num_(t->output_num()), - remote_task_(t->remote_task()), - remote_context_id_(t->context_id()), ctx_(ctx), is_remote_(true), is_async_(true), @@ -320,8 +319,10 @@ Status TensorHandle::WaitReady(const char* caller) const { if (!IsReady()) { profiler::TraceMe activity(absl::StrCat(caller, " WaitReady"), profiler::TraceMeLevel::kInfo); + DVLOG(3) << "Waiting on TensorHandle " << this; tf_shared_lock l(mu_); mu_.Await(Condition(&is_ready_)); + DVLOG(3) << "TensorHandle ready: " << this; } return is_poisoned_; } @@ -333,9 +334,12 @@ Status TensorHandle::Tensor(const tensorflow::Tensor** t) const { Status TensorHandle::TensorFromDevice(const Device* d, const tensorflow::Tensor** t) const { - TF_RETURN_IF_ERROR(WaitReady("TensorHandle::TensorFromDevice")); - if (d == absl::get(device_)) { + if (is_remote_) { + return errors::Internal("Invalid Tensor call on remote handle: ", this); + } + + TF_RETURN_IF_ERROR(WaitReady("TensorHandle::TensorFromDevice")); return tensor_handle_data_->Tensor(t); } @@ -347,6 +351,7 @@ Status TensorHandle::TensorFromDevice(const Device* d, auto empty_mirror = empty_local_mirrors_.find(d); if (empty_mirror != empty_local_mirrors_.end()) { + // TODO(gjn): Add support for waiting on local mirrors return errors::Internal("Attempted to get Tensor for empty mirror"); } @@ -355,9 +360,13 @@ Status TensorHandle::TensorFromDevice(const Device* d, } Status TensorHandle::TensorValue(tensorflow::TensorValue* t, const Device* d) { - TF_RETURN_IF_ERROR(WaitReady("TensorHandle::TensorValue")); - if (d == absl::get(device_)) { + if (is_remote_) { + return errors::Internal("Invalid TensorValue call on remote handle: ", + this); + } + + TF_RETURN_IF_ERROR(WaitReady("TensorHandle::TensorValue")); return tensor_handle_data_->TensorValue(t); } @@ -369,6 +378,7 @@ Status TensorHandle::TensorValue(tensorflow::TensorValue* t, const Device* d) { auto empty_mirror = empty_local_mirrors_.find(d); if (empty_mirror != empty_local_mirrors_.end()) { + // TODO(gjn): Add support for waiting on local mirrors return errors::Internal("Attempted to get TensorValue for empty mirror"); } @@ -515,7 +525,7 @@ Status TensorHandle::Unprotect(const Device* d) { " in Unprotect call to handle: ", this); } -bool TensorHandle::HasLocalMirror(Device* d) { +bool TensorHandle::HasLocalMirror(const Device* d) const { mutex_lock l(mu_); auto mirror = local_mirrors_.find(d); if (mirror != local_mirrors_.end()) { @@ -530,7 +540,10 @@ bool TensorHandle::HasLocalMirror(Device* d) { return false; } -Status TensorHandle::AddEmptyLocalMirror(Device* d) { +Status TensorHandle::AddEmptyLocalMirror(const Device* d) { + DVLOG(3) << "AddEmptyLocalMirror on TensorHandle: " << this + << " device: " << d; + mutex_lock l(mu_); if (local_mirrors_.find(d) != local_mirrors_.end()) { return errors::Internal("Attempted to duplicate a local mirror."); @@ -545,18 +558,18 @@ Status TensorHandle::AddEmptyLocalMirror(Device* d) { } #if !defined(IS_MOBILE_PLATFORM) -Status TensorHandle::RemoteAddress(Device* d, int64* op_id, +Status TensorHandle::RemoteAddress(const Device* d, int64* op_id, int32* output_num) const { if (VariantDeviceIsCustom(device_) || d != absl::get(device_)) { tf_shared_lock l(mu_); - auto mirror = remote_mirrors_.find(d); + auto mirror = remote_mirrors_.find(d->name()); if (mirror != remote_mirrors_.end()) { *op_id = mirror->second->op_id(); *output_num = mirror->second->output_num(); return Status::OK(); } - auto unshaped_mirror = unshaped_remote_mirrors_.find(d); + 
auto unshaped_mirror = unshaped_remote_mirrors_.find(d->name()); if (unshaped_mirror != unshaped_remote_mirrors_.end()) { *op_id = unshaped_mirror->second->op_id(); *output_num = unshaped_mirror->second->output_num(); @@ -578,97 +591,149 @@ Status TensorHandle::RemoteAddress(Device* d, int64* op_id, return Status::OK(); } -void TensorHandle::SetRemoteOpIdAndOutputNumToLocalTensorHandle( - const int64 op_id, const int32 output_num) { - DCHECK(!is_remote_); - remote_op_id_ = op_id; - remote_output_num_ = output_num; -} +bool TensorHandle::HasRemoteMirror(const Device* d, + uint64 context_view_id) const { + DVLOG(3) << "HasRemoteMirror on TensorHandle: " << this; -bool TensorHandle::HasRemoteMirror(Device* d) { tf_shared_lock l(mu_); - auto mirror = remote_mirrors_.find(d); + auto mirror = remote_mirrors_.find(d->name()); if (mirror != remote_mirrors_.end()) { + // Check if mirror is stale + if (mirror->second->context_view_id() != context_view_id) { + return false; + } return true; } - auto unshaped_mirror = unshaped_remote_mirrors_.find(d); + auto unshaped_mirror = unshaped_remote_mirrors_.find(d->name()); if (unshaped_mirror != unshaped_remote_mirrors_.end()) { + // Check if mirror is stale + if (unshaped_mirror->second->context_view_id() != context_view_id) { + return false; + } return true; } return false; } -bool TensorHandle::HasResourceShapeMirror(Device* d) { +bool TensorHandle::HasResourceShapeMirror(const Device* d, + uint64 context_view_id) const { + DVLOG(3) << "HasResourceShapeMirror on TensorHandle: " << this; + tf_shared_lock l(mu_); - auto mirror = resource_shape_mirrors_.find(d); + auto mirror = resource_shape_mirrors_.find(d->name()); if (mirror != resource_shape_mirrors_.end()) { + // Check if mirror is stale + if (mirror->second->context_view_id() != context_view_id) { + return false; + } return true; } return false; } Status TensorHandle::AddUnshapedRemoteMirror( - std::unique_ptr t, Device* d) { + std::unique_ptr t, const Device* d) { + DVLOG(3) << "AddUnshapedRemoteMirror on TensorHandle: " << this; + mutex_lock l(mu_); - if (remote_mirrors_.find(d) != remote_mirrors_.end()) { - return errors::Internal("Attempted to duplicate a remote mirror."); + auto remote_mirror = remote_mirrors_.find(d->name()); + if (remote_mirror != remote_mirrors_.end()) { + if (remote_mirror->second->context_view_id() == t->context_view_id()) { + return errors::Internal("Attempted to duplicate a remote mirror."); + } + // Remove stale mirror + remote_mirrors_.erase(remote_mirror); } - auto ret = unshaped_remote_mirrors_.insert(std::make_pair(d, std::move(t))); - if (!ret.second) { - return errors::Internal( - "Attempted to duplicate an unshaped remote mirror."); + auto unshaped_remote_mirror = unshaped_remote_mirrors_.find(d->name()); + if (unshaped_remote_mirror != unshaped_remote_mirrors_.end()) { + if (unshaped_remote_mirror->second->context_view_id() == + t->context_view_id()) { + return errors::Internal( + "Attempted to duplicate an unshaped remote mirror."); + } + // Remove stale mirror + unshaped_remote_mirrors_.erase(unshaped_remote_mirror); } + unshaped_remote_mirrors_[d->name()] = std::move(t); + return Status::OK(); } Status TensorHandle::AddResourceShapeMirror( - std::unique_ptr t, Device* d) { + std::unique_ptr t, const Device* d) { + DVLOG(3) << "AddResourceShapeMirror on TensorHandle: " << this; + mutex_lock l(mu_); - auto ret = resource_shape_mirrors_.insert(std::make_pair(d, std::move(t))); - if (!ret.second) { - return errors::Internal("Attempted to duplicate a resource shape 
mirror."); + auto mirror = resource_shape_mirrors_.find(d->name()); + if (mirror != resource_shape_mirrors_.end()) { + if (mirror->second->context_view_id() == t->context_view_id()) { + return errors::Internal( + "Attempted to duplicate a resource shape mirror."); + } + // Remove stale mirror + resource_shape_mirrors_.erase(mirror); } + resource_shape_mirrors_[d->name()] = std::move(t); + return Status::OK(); } Status TensorHandle::AddRemoteMirror(std::unique_ptr t, - Device* d) { + const Device* d) { + DVLOG(3) << "AddRemoteMirror on TensorHandle: " << this << " device: " << d; + mutex_lock l(mu_); - auto ret = remote_mirrors_.insert(std::make_pair(d, std::move(t))); - if (!ret.second) { - return errors::Internal("Attempted to duplicate a remote mirror."); + auto mirror = remote_mirrors_.find(d->name()); + if (mirror != remote_mirrors_.end()) { + if (mirror->second->context_view_id() == t->context_view_id()) { + return errors::Internal("Attempted to duplicate a remote mirror."); + } + // Remove stale mirror + remote_mirrors_.erase(mirror); } + remote_mirrors_[d->name()] = std::move(t); + return Status::OK(); } -Status TensorHandle::SetRemoteShape(const TensorShape& shape, - tensorflow::Device* d) { +Status TensorHandle::SetRemoteShape(const TensorShape& shape, const Device* d, + uint64 context_view_id) { DVLOG(3) << "SetRemoteShape on TensorHandle: " << this << " device: " << d; if (VariantDeviceIsCustom(device_) || d != absl::get(device_)) { mutex_lock l(mu_); - if (remote_mirrors_.find(d) != remote_mirrors_.end()) { - return errors::Internal( - "Attempted to set remote shape for existing mirror."); + auto remote_mirror = remote_mirrors_.find(d->name()); + if (remote_mirror != remote_mirrors_.end()) { + if (remote_mirror->second->context_view_id() == context_view_id) { + return errors::Internal( + "Attempted to set remote shape for existing mirror."); + } + remote_mirrors_.erase(remote_mirror); } - auto elem = unshaped_remote_mirrors_.find(d); + auto elem = unshaped_remote_mirrors_.find(d->name()); if (elem == unshaped_remote_mirrors_.end()) { return errors::Internal( "Attempted to set remote shape for non-waiting mirror."); } + if (elem->second->context_view_id() != context_view_id) { + unshaped_remote_mirrors_.erase(elem); + return errors::Internal( + "Attempted to set remote shape for a stale mirror."); + } + auto& data = elem->second; data->ReleaseRemoteTensorHandle(); - remote_mirrors_[d] = absl::make_unique( + remote_mirrors_[d->name()] = absl::make_unique( data->op_id(), data->output_num(), shape, data->remote_task(), - data->context_id(), data->ctx()); + &data->ctx()); unshaped_remote_mirrors_.erase(elem); return Status::OK(); @@ -680,10 +745,13 @@ Status TensorHandle::SetRemoteShape(const TensorShape& shape, UnshapedRemoteTensorHandleData* p = reinterpret_cast( tensor_handle_data_.get()); + if (p->context_view_id() != context_view_id) { + return errors::Internal("Attempted to set remote shape for an old handle."); + } + p->ReleaseRemoteTensorHandle(); tensor_handle_data_ = absl::make_unique( - remote_op_id_, remote_output_num_, shape, remote_task_, - remote_context_id_, ctx_); + remote_op_id_, remote_output_num_, shape, p->remote_task(), ctx_); is_poisoned_ = Status::OK(); mutex_lock l(mu_); is_ready_ = true; @@ -836,7 +904,7 @@ Device* GetResourceDevice(const ResourceHandle& handle, EagerContext* ctx) { } string TensorHandle::DebugString() const { - DVLOG(1) << "Calling TensorHandle::DebugString() on " << this; + DVLOG(4) << "Calling TensorHandle::DebugString() on " << this; 
string out; string device_debug = VariantDeviceDebugString(device_); diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.h b/tensorflow/core/common_runtime/eager/tensor_handle.h index bae03a96f33..817347adc92 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle.h +++ b/tensorflow/core/common_runtime/eager/tensor_handle.h @@ -103,19 +103,17 @@ class TensorHandle : public core::RefCounted { #if !defined(IS_MOBILE_PLATFORM) static Status CreateRemoteHandle(int64 op_id, int output_num, const TensorShape& shape, - const string& remote_task, uint64 context_id, - DataType dtype, Device* d, - Device* resource_device, EagerContext* ctx, - TensorHandle** h); + const string& remote_task, DataType dtype, + Device* d, Device* resource_device, + EagerContext* ctx, TensorHandle** h); static Status CreateRemoteHandle(std::unique_ptr t, DataType dtype, Device* d, Device* resource_device, EagerContext* ctx, TensorHandle** h); static Status CreateUnshapedRemoteHandle(int64 op_id, int32 output_num, const string& remote_task, - uint64 context_id, DataType dtype, - Device* device, EagerContext* ctx, - TensorHandle** h); + DataType dtype, Device* device, + EagerContext* ctx, TensorHandle** h); static Status CreateUnshapedRemoteHandle( std::unique_ptr t, DataType dtype, Device* device, EagerContext* ctx, TensorHandle** h); @@ -151,28 +149,24 @@ class TensorHandle : public core::RefCounted { // Checks if a mirror tensor exists for the specified device. Mirrors are only // maintained for local devices, like CPUs & GPUs. Note a mirror may be empty, // as it is still to be set by an async operation. - bool HasLocalMirror(Device* d); + bool HasLocalMirror(const Device* d) const; // Add an empty mirror placeholder for the specified device. The expectation // is this will be populated by a call to SetTensor. - Status AddEmptyLocalMirror(Device* d); + Status AddEmptyLocalMirror(const Device* d); #if !defined(IS_MOBILE_PLATFORM) - bool HasRemoteMirror(Device* d); - bool HasResourceShapeMirror(Device* d); + bool HasRemoteMirror(const Device* d, uint64 context_view_id) const; + bool HasResourceShapeMirror(const Device* d, uint64 context_view_id) const; Status AddUnshapedRemoteMirror( - std::unique_ptr t, Device* d); - Status AddRemoteMirror(std::unique_ptr t, Device* d); + std::unique_ptr t, const Device* d); + Status AddRemoteMirror(std::unique_ptr t, + const Device* d); Status AddResourceShapeMirror( - std::unique_ptr t, Device* d); + std::unique_ptr t, const Device* d); // Return the op_id and output num if the handle refers to a remote tensor. - Status RemoteAddress(Device* d, int64* op_id, int32* output_num) const; - - // Set remote_op_id_ and remote_output_num_ if the handle refers to a local - // tensor that needs to be copied to remote workers. - void SetRemoteOpIdAndOutputNumToLocalTensorHandle(const int64 op_id, - const int32 output_num); + Status RemoteAddress(const Device* d, int64* op_id, int32* output_num) const; // Called on an async remote tensor once it's shape has been determined. This // transitions the tensor handle from a non-ready to a ready state by @@ -180,7 +174,8 @@ class TensorHandle : public core::RefCounted { // queried. // This method or Poison must be called exactly once for remote tensors that // were created without a known shape. 
- Status SetRemoteShape(const TensorShape& shape, tensorflow::Device* d); + Status SetRemoteShape(const TensorShape& shape, const Device* d, + uint64 context_view_id); #endif // Sets the `tensor` for this async non-ready handle making it ready. @@ -277,23 +272,21 @@ class TensorHandle : public core::RefCounted { // TODO(yujingzhang): Remove resource_shape_mirrors_ once scalable per-replica // variable is ready, since we could get the shape locally without remote copy // then. - std::map> + std::map> resource_shape_mirrors_ GUARDED_BY(mu_); - // TODO(gjn): Unshaped remote mirrors are long expected to be long-lived. + // TODO(gjn): Unshaped remote mirrors are not expected to be long-lived. // Consider replacing the unshaped_remote_mirrors_ map with something more // efficient. - std::map> + std::map> unshaped_remote_mirrors_ GUARDED_BY(mu_); // TODO(gjn): Is std::map the most optimal choice here? Perhaps this should be // a fixed size map. - std::map> - remote_mirrors_ GUARDED_BY(mu_); + std::map> remote_mirrors_ + GUARDED_BY(mu_); // IDs required when this class is representing a remote tensor handle. int64 remote_op_id_; int32 remote_output_num_; - string remote_task_; - uint64 remote_context_id_; #endif // `ctx` is only guaranteed to be set if the handle is not "ready". This is diff --git a/tensorflow/core/distributed_runtime/eager/destroy_tensor_handle_node.h b/tensorflow/core/distributed_runtime/eager/destroy_tensor_handle_node.h index bc1670b9f71..2f4f7b91280 100644 --- a/tensorflow/core/distributed_runtime/eager/destroy_tensor_handle_node.h +++ b/tensorflow/core/distributed_runtime/eager/destroy_tensor_handle_node.h @@ -67,6 +67,9 @@ class DestroyTensorHandleNode : public tensorflow::AsyncEagerNode { void Abort(Status status) override {} + // Remote node deletions are best effort + bool Fatal() const override { return false; } + string DebugString() const override { string out = "[DestroyTensorHandleNode]"; strings::StrAppend(&out, " request: ", request_->DebugString()); diff --git a/tensorflow/core/distributed_runtime/eager/eager_client.h b/tensorflow/core/distributed_runtime/eager/eager_client.h index 3b083f3cae6..5f260e477d6 100644 --- a/tensorflow/core/distributed_runtime/eager/eager_client.h +++ b/tensorflow/core/distributed_runtime/eager/eager_client.h @@ -57,6 +57,8 @@ class EagerClient : public core::RefCounted { virtual void StreamingEnqueueAsync(const EnqueueRequest* request, EnqueueResponse* response, StatusCallback done) = 0; + + virtual bool allow_multiple_pending_requests() const = 0; }; // Simple wrapper class that can be used to retrieve EagerClients. 
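The remote_copy_node.cc and remote_execute_node.cc changes below all capture `context_view_id` by value at the moment the RPC is enqueued, so a cluster update that bumps the view id mid-flight cannot change what the completion callback compares against. The pattern in isolation (plain C++; the callback body is a placeholder):

```cpp
#include <cstdint>
#include <functional>

std::function<void()> MakeShapeCallback(uint64_t current_view_id) {
  // C++14 init-capture: snapshot the value now instead of referencing
  // mutable context state that may change before the RPC completes.
  return [context_view_id = current_view_id]() {
    // In the real code this is where SetRemoteShape(shape, device,
    // context_view_id) runs, rejecting the update if the handle's mirror
    // belongs to a different (stale) view.
    (void)context_view_id;
  };
}
```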
diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc index f85e20db084..4f60b488144 100644 --- a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc +++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc @@ -433,7 +433,7 @@ Status EagerServiceImpl::Enqueue(const EnqueueRequest* request, } else if (item.has_cleanup_function()) { s = CleanupFunction(item.cleanup_function()); } else { - DCHECK(item.has_clear_remote_executor_for_stream()); + DCHECK(item.has_sync_remote_executor_for_stream()); s = executor.WaitForAllPendingNodes(); } diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl.h b/tensorflow/core/distributed_runtime/eager/eager_service_impl.h index 16ca9bbcc3d..f786d70e51f 100644 --- a/tensorflow/core/distributed_runtime/eager/eager_service_impl.h +++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.h @@ -186,6 +186,9 @@ class EagerServiceImpl { void Abort(Status status) override {} + // Remote node deletions are best effort + bool Fatal() const override { return false; } + string DebugString() const override { string out = "[ClientTensorHandleDeleteNode]"; strings::StrAppend(&out, " op_id: ", handle_to_delete_->op_id); diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc index 686f471ca5e..41db645507b 100644 --- a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc +++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc @@ -96,6 +96,8 @@ class FakeEagerClient : public EagerClient { done(impl_->Enqueue(request, response)); } + bool allow_multiple_pending_requests() const override { return false; } + private: TestEagerServiceImpl* impl_; }; diff --git a/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc b/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc index b020ed8944e..f5fc68a8e38 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc +++ b/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc @@ -224,12 +224,15 @@ void RemoteCopyNode::RunRemoteRecv(EagerOperation* op, StatusCallback done) { EnqueueResponse* response = new EnqueueResponse; const std::shared_ptr& captured_state = captured_state_; Device* recv_device = recv_device_; + uint64 context_view_id = ctx_->GetContextViewId(); eager_client->StreamingEnqueueAsync( &request, response, - [captured_state, response, recv_device, done](const Status& s) { + [captured_state, response, recv_device, context_view_id, + done](const Status& s) { if (s.ok()) { Status status = captured_state->dst()->SetRemoteShape( - response->queue_response(0).shape(0), recv_device); + response->queue_response(0).shape(0), recv_device, + context_view_id); if (!status.ok()) { LOG(ERROR) << "Ignoring an error encountered when setting remote " "shape of tensor received by remote Recv op: " @@ -319,12 +322,14 @@ void RemoteCopyNode::StartRemoteSendTensor(StatusCallback done) { const std::shared_ptr& captured_state = captured_state_; captured_state->SetSrcShape(tensor.shape()); Device* recv_device = recv_device_; + uint64 context_view_id = ctx_->GetContextViewId(); eager_client->StreamingEnqueueAsync( &request, response, - [captured_state, response, recv_device, done](const Status& s) { + [captured_state, response, recv_device, context_view_id, + done](const Status& s) { if (s.ok()) { Status status = 
captured_state->dst()->SetRemoteShape( - captured_state->GetSrcShape(), recv_device); + captured_state->GetSrcShape(), recv_device, context_view_id); if (!status.ok()) { LOG(ERROR) << "Ignoring an error encountered when setting remote " "shape of tensor received by SendTensor rpc: " diff --git a/tensorflow/core/distributed_runtime/eager/remote_execute_node.cc b/tensorflow/core/distributed_runtime/eager/remote_execute_node.cc index 05980861a9d..f84e0ebb5ee 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_execute_node.cc +++ b/tensorflow/core/distributed_runtime/eager/remote_execute_node.cc @@ -58,8 +58,8 @@ void RemoteExecuteNode::RunAsync(StatusCallback done) { eager_client_->StreamingEnqueueAsync( request_.get(), response, - [inputs, retvals, response, device, rpc_description, - done](const Status& status) { + [inputs, retvals, response, device, context_view_id = context_view_id_, + rpc_description, done](const Status& status) { for (auto handle : inputs) { handle->Unref(); } @@ -72,7 +72,7 @@ void RemoteExecuteNode::RunAsync(StatusCallback done) { for (size_t i = 0; i < retvals.size(); ++i) { if (status.ok()) { Status s = retvals[i]->SetRemoteShape( - response->queue_response(0).shape(i), device); + response->queue_response(0).shape(i), device, context_view_id); if (!s.ok()) { LOG(ERROR) << "Ignoring an error encountered when setting " "remote shape of tensor handle: " diff --git a/tensorflow/core/distributed_runtime/eager/remote_execute_node.h b/tensorflow/core/distributed_runtime/eager/remote_execute_node.h index b0342fc5056..78a7dcb5b24 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_execute_node.h +++ b/tensorflow/core/distributed_runtime/eager/remote_execute_node.h @@ -34,16 +34,19 @@ namespace eager { // RemoteExecuteNode is an implementation of EagerNode which enqueues // an operation via RPC in a remote EagerService. -class RemoteExecuteNode : public AsyncEagerNode { +class RemoteExecuteNode : public AsyncRemoteExecuteNode { public: - RemoteExecuteNode(std::unique_ptr request, Device* device, - EagerClient* eager_client, const NodeDef& ndef, - FunctionLibraryDefinition* lib_def, + RemoteExecuteNode(EagerContext* eager_context, + std::unique_ptr request, Device* device, + uint64 context_view_id, EagerClient* eager_client, + const NodeDef& ndef, FunctionLibraryDefinition* lib_def, const gtl::InlinedVector& inputs, absl::Span retvals) - : AsyncEagerNode(), + : AsyncRemoteExecuteNode(), + eager_context_(eager_context), request_(std::move(request)), device_(device), + context_view_id_(context_view_id), eager_client_(eager_client), ndef_(ndef), lib_def_(lib_def), @@ -61,6 +64,16 @@ class RemoteExecuteNode : public AsyncEagerNode { handle->Ref(); } eager_client_->Ref(); + + needs_remote_inputs_ = false; + for (const TensorHandle* input : inputs_) { + // TODO(bramandia): Should this be op_device() instead? 
+ if (input->resource_device() != nullptr && + input->resource_device() != device_) { + needs_remote_inputs_ = true; + break; + } + } } ~RemoteExecuteNode() override { @@ -80,12 +93,22 @@ class RemoteExecuteNode : public AsyncEagerNode { void RunAsync(StatusCallback done) override; + Status SyncExecutors() override { return eager_context_->SyncExecutors(); } + void Abort(Status status) override { for (auto handle : retvals_) { handle->Poison(status); } } + const EagerClient* eager_client() const override { return eager_client_; } + + bool needs_remote_inputs() const override { return needs_remote_inputs_; } + + bool allow_multiple_pending_requests() const override { + return eager_client_->allow_multiple_pending_requests(); + } + string DebugString() const override { string out = "[RemoteExecuteNode]"; strings::StrAppend(&out, " request: ", request_->DebugString()); @@ -94,8 +117,11 @@ class RemoteExecuteNode : public AsyncEagerNode { } private: + EagerContext* eager_context_; // Not owned, and must outlive this node. std::unique_ptr request_; Device* device_; // Not owned + uint64 context_view_id_; + bool needs_remote_inputs_; EagerClient* eager_client_; // Not owned, and must outlive this node. const NodeDef ndef_; const FunctionLibraryDefinition* lib_def_; diff --git a/tensorflow/core/distributed_runtime/eager/remote_mgr.cc b/tensorflow/core/distributed_runtime/eager/remote_mgr.cc index aefe86c654d..f2e6735c6b7 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_mgr.cc +++ b/tensorflow/core/distributed_runtime/eager/remote_mgr.cc @@ -168,8 +168,7 @@ Status RemoteMgr::DeserializeRemoteTensorHandle(const RemoteTensorHandle& in, "Unable to find remote task corresponding to device ", device_name); } auto remote_handle_data = absl::make_unique( - in.op_id(), in.output_num(), remote_task, parent_->GetContextId(), - parent_); + in.op_id(), in.output_num(), remote_task, parent_); remote_handle_data->ReleaseRemoteTensorHandle(); TF_RETURN_IF_ERROR(TensorHandle::CreateUnshapedRemoteHandle( std::move(remote_handle_data), in.dtype(), device, parent_, out)); diff --git a/tensorflow/core/distributed_runtime/eager/remote_mgr_test.cc b/tensorflow/core/distributed_runtime/eager/remote_mgr_test.cc index 312637b9965..3d2eb7f57cd 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_mgr_test.cc +++ b/tensorflow/core/distributed_runtime/eager/remote_mgr_test.cc @@ -49,7 +49,6 @@ class RemoteMgrTest : public ::testing::Test { DeviceFactory::NewDevice("CPU", {}, "/job:worker/replica:0/task:0")); remote_device_ = devices.back().get(); auto device_mgr = absl::make_unique(std::move(devices)); - context_id_ = random::New64(); tensorflow::Rendezvous* rendezvous = new tensorflow::IntraProcessRendezvous(device_mgr.get()); ctx_ = new tensorflow::EagerContext( @@ -64,7 +63,6 @@ class RemoteMgrTest : public ::testing::Test { Device* local_device_; Device* remote_device_; - uint64 context_id_; EagerContext* ctx_; }; @@ -78,7 +76,7 @@ TEST_F(RemoteMgrTest, SerializeLocalTensorHandleWithRemoteMirror) { const uint64 op_id = 2; const int output_num = 3; auto tensor_handle_data = absl::make_unique( - op_id, output_num, t.shape(), /*remote_task=*/"", context_id_, ctx_); + op_id, output_num, t.shape(), /*remote_task=*/"", ctx_); TF_ASSERT_OK( handle->AddRemoteMirror(std::move(tensor_handle_data), remote_device_)); RemoteTensorHandle remote_handle; @@ -98,9 +96,8 @@ TEST_F(RemoteMgrTest, SerializeRemoteTensorHandle) { const int output_num = 1; TensorHandle* handle; 
TF_ASSERT_OK(TensorHandle::CreateRemoteHandle( - op_id, output_num, t.shape(), /*remote_task=*/"", context_id_, DT_FLOAT, - remote_device_, - /*resource_device=*/nullptr, ctx_, &handle)); + op_id, output_num, t.shape(), /*remote_task=*/"", DT_FLOAT, + remote_device_, /*resource_device=*/nullptr, ctx_, &handle)); RemoteTensorHandle remote_handle; TF_ASSERT_OK(remote_mgr.SerializeRemoteTensorHandle( handle, &remote_handle, remote_device_, remote_device_->name())); diff --git a/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.cc b/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.cc index e083aedcc47..704bef5a253 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.cc +++ b/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.cc @@ -86,24 +86,24 @@ void DestroyRemoteTensorHandle(EagerContext* ctx, const string& remote_task, RemoteTensorHandleData::RemoteTensorHandleData(int64 op_id, int output_num, const TensorShape& shape, const string& remote_task, - uint64 context_id, EagerContext* ctx) : op_id_(op_id), output_num_(output_num), shape_(shape), remote_task_(remote_task), - context_id_(context_id), - ctx_(ctx) { + context_id_(ctx->GetContextId()), + context_view_id_(ctx->GetContextViewId()), + ctx_(*ctx) { DCHECK(op_id_ >= 0 && output_num_ >= 0) << "Op ID and output num should be >= 0. Op ID: " << op_id << ", Output num: " << output_num; - ctx->Ref(); + ctx_.Ref(); } RemoteTensorHandleData::~RemoteTensorHandleData() { - DestroyRemoteTensorHandle(ctx_, remote_task_, context_id_, op_id_, + DestroyRemoteTensorHandle(&ctx_, remote_task_, context_id_, op_id_, output_num_, /*ready=*/true); - ctx_->Unref(); + ctx_.Unref(); } Status RemoteTensorHandleData::Tensor(const tensorflow::Tensor** t) const { @@ -152,26 +152,26 @@ string RemoteTensorHandleData::DebugString() const { } UnshapedRemoteTensorHandleData::UnshapedRemoteTensorHandleData( - int64 op_id, int32 output_num, const string& remote_task, uint64 context_id, - EagerContext* ctx) + int64 op_id, int32 output_num, const string& remote_task, EagerContext* ctx) : op_id_(op_id), output_num_(output_num), delete_remote_tensor_(true), remote_task_(remote_task), - context_id_(context_id), - ctx_(ctx) { + context_id_(ctx->GetContextId()), + context_view_id_(ctx->GetContextViewId()), + ctx_(*ctx) { DCHECK(op_id_ >= 0 && output_num_ >= 0) << "Op ID and output num should be >= 0. 
Op ID: " << op_id << ", Output num: " << output_num; - ctx->Ref(); + ctx_.Ref(); } UnshapedRemoteTensorHandleData::~UnshapedRemoteTensorHandleData() { if (delete_remote_tensor_) { - DestroyRemoteTensorHandle(ctx_, remote_task_, context_id_, op_id_, + DestroyRemoteTensorHandle(&ctx_, remote_task_, context_id_, op_id_, output_num_, /*ready=*/false); } - ctx_->Unref(); + ctx_.Unref(); } Status UnshapedRemoteTensorHandleData::Tensor( diff --git a/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.h b/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.h index 56c51beffb0..9f7db52b447 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.h +++ b/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.h @@ -25,8 +25,7 @@ namespace tensorflow { class RemoteTensorHandleData : public TensorHandleData { public: RemoteTensorHandleData(int64 op_id, int output_num, const TensorShape& shape, - const string& remote_task, uint64 context_id, - EagerContext* ctx); + const string& remote_task, EagerContext* ctx); ~RemoteTensorHandleData() override; // A remote tensor handle does not have a Tensor object, hence it can only @@ -38,11 +37,14 @@ class RemoteTensorHandleData : public TensorHandleData { Status Dim(int dim_index, int64* dim) const override; Status NumElements(int64* num_elements) const override; Status Unprotect() override; + EagerContext& ctx() const { return ctx_; } string DebugString() const override; int64 op_id() const { return op_id_; } int32 output_num() const { return output_num_; } + uint64 context_id() const { return context_id_; } + uint64 context_view_id() const { return context_view_id_; } private: // IDs required when this class is representing a remote tensor handle. @@ -51,7 +53,8 @@ class RemoteTensorHandleData : public TensorHandleData { const TensorShape shape_; string remote_task_; uint64 context_id_; - EagerContext* const ctx_; + uint64 context_view_id_; + EagerContext& ctx_; }; // Async Remote Tensor Handle: A handle to a Tensor on a remote host. Once the @@ -59,8 +62,7 @@ class RemoteTensorHandleData : public TensorHandleData { class UnshapedRemoteTensorHandleData : public TensorHandleData { public: UnshapedRemoteTensorHandleData(int64 op_id, int32 output_num, - const string& remote_task, uint64 context_id, - EagerContext* ctx); + const string& remote_task, EagerContext* ctx); ~UnshapedRemoteTensorHandleData() override; // Unshaped remote tensor handles are not ready and hence cannot satisfy any @@ -79,7 +81,8 @@ class UnshapedRemoteTensorHandleData : public TensorHandleData { int32 output_num() const { return output_num_; } string remote_task() const { return remote_task_; } uint64 context_id() const { return context_id_; } - EagerContext* ctx() const { return ctx_; } + uint64 context_view_id() const { return context_view_id_; } + EagerContext& ctx() const { return ctx_; } // When constructed, UnshapedRemoteTensorHandleData owns the remote // TensorHandle and should delete it by issuing an RPC. 
Once the remote @@ -97,7 +100,8 @@ class UnshapedRemoteTensorHandleData : public TensorHandleData { bool delete_remote_tensor_; string remote_task_; uint64 context_id_; - EagerContext* const ctx_; + uint64 context_view_id_; + EagerContext& ctx_; }; } // namespace tensorflow diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc index 5ad48118ae9..c1811303bc9 100644 --- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc +++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc @@ -115,6 +115,10 @@ class GrpcEagerClient : public EagerClient { } ~GrpcEagerClient() override { thread_->Unref(); } + bool allow_multiple_pending_requests() const override { + return EnableStreaming(); + } + #define CLIENT_METHOD(method) \ void method##Async(const method##Request* request, \ method##Response* response, StatusCallback done) \ diff --git a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc index cfe1d6081d8..29fb04640bf 100644 --- a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc +++ b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc @@ -264,7 +264,7 @@ TEST_F(ReadyNodeManagerTest, GetCurrNodeFirstReadyManager) { // should return it. EXPECT_EQ("Node6", manager.GetCurrNode()->name()); - // Now insrets a few other nodes, but their time_ready's are even smaller than + // Now inserts a few other nodes, but their time_ready's are even smaller than // that of Node6. Before calling RemoveCurrNode(), GetCurrNode() should return // the same node, Node6, in this case. NodeDef node7; diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index e42de02b979..7f1323603cb 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -80,9 +80,7 @@ package_group( package_group( name = "optimizer_helper_friends", - packages = [ - "//learning/brain/research/lather/...", - ], + packages = ["//learning/brain/research/lather/..."], ) config_setting( @@ -3245,6 +3243,22 @@ tf_cc_tests( ], ) +tf_cc_test( + name = "non_max_suppression_op_benchmark_test", + srcs = ["non_max_suppression_op_benchmark_test.cc"], + deps = [ + ":image", + ":ops_testutil", + ":ops_util", + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + tf_cuda_cc_test( name = "resize_bilinear_op_test", srcs = ["resize_bilinear_op_test.cc"], @@ -6109,6 +6123,7 @@ tf_cc_test( "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", + "//tensorflow/core/platform:resource_loader", "//third_party/eigen3", ], ) @@ -7990,6 +8005,11 @@ tf_cc_test_mkl( name = "mkl_dequantize_op_test", size = "small", srcs = ["mkl_dequantize_op_test.cc"], + # TODO(b/149940073): Re-enable. + tags = [ + "no_oss", + "notap", + ], deps = [ ":mkl_dequantize_op", ":mkl_tfconv_op", diff --git a/tensorflow/core/kernels/batch_kernels.cc b/tensorflow/core/kernels/batch_kernels.cc index 4cf591859a9..69349b47eff 100644 --- a/tensorflow/core/kernels/batch_kernels.cc +++ b/tensorflow/core/kernels/batch_kernels.cc @@ -30,6 +30,7 @@ limitations under the License. 
#include "tensorflow/core/platform/context.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/util/ptr_util.h" namespace tensorflow { @@ -44,7 +45,7 @@ typedef Eigen::SyclDevice SYCLDevice; // op's output at position 'output_index', using 'context' for the allocation to // ensure proper device placement. template -Status Concat(OpKernelContext* context, const gtl::ArraySlice& inputs, +Status Concat(OpKernelContext* context, const gtl::ArraySlice inputs, Tensor* output) { const int input_dims = inputs[0].dims(); const TensorShape& input_shape = inputs[0].shape(); @@ -106,7 +107,7 @@ Status Concat(OpKernelContext* context, const gtl::ArraySlice& inputs, // applicable special case and wrote to the outputs. Otherwise acts as a no-op. template Status SplitEasyCases(OpKernelContext* context, const Tensor& input, - const gtl::ArraySlice& sizes, + const gtl::ArraySlice sizes, std::vector* outputs, bool* done) { *done = false; @@ -143,7 +144,7 @@ Status SplitEasyCases(OpKernelContext* context, const Tensor& input, // Handles the general case, on CPU. template Status SplitCPU(OpKernelContext* context, const Tensor& input, - const gtl::ArraySlice& sizes, + const gtl::ArraySlice sizes, std::vector* outputs) { int64 suffix_dim_size = 1; for (int i = 1; i < input.shape().dims(); ++i) { @@ -192,8 +193,7 @@ Status SplitGPU(OpKernelContext* context, const Tensor& input, // The outer function that dispatches to the various Split*() functions above. template Status Split(OpKernelContext* context, const Tensor& input, - const gtl::ArraySlice& sizes, - std::vector* outputs) { + const gtl::ArraySlice sizes, std::vector* outputs) { bool easy_cases_done; TF_RETURN_IF_ERROR( SplitEasyCases(context, input, sizes, outputs, &easy_cases_done)); @@ -245,13 +245,13 @@ class BatchResource : public ResourceBase { Status RegisterInput(int64 guid, OpKernelContext* context, const string& batcher_queue_name, AsyncOpKernel::DoneCallback done_callback) { - std::unique_ptr batch_components(new BatchTask); + auto batch_components = MakeUnique(); batch_components->guid = guid; batch_components->propagated_context = Context(ContextKind::kThread); OpInputList tensors; TF_RETURN_IF_ERROR(context->input_list("in_tensors", &tensors)); - for (int i = 0; i < tensors.size(); ++i) { - const Tensor& tensor = tensors[i]; + batch_components->inputs.reserve(tensors.size()); + for (const Tensor& tensor : tensors) { if (tensor.shape().dims() == 0) { return errors::InvalidArgument( "Batching input tensors must have at least one dimension"); @@ -268,6 +268,7 @@ class BatchResource : public ResourceBase { const auto captured_status = context->input_list("captured_tensors", &captured_tensors); if (captured_status.ok()) { + batch_components->captured_inputs.reserve(captured_tensors.size()); for (const Tensor& captured_tensor : captured_tensors) { batch_components->captured_inputs.push_back(captured_tensor); } diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc index ccdafdf91c9..4f26aed641e 100644 --- a/tensorflow/core/kernels/constant_op.cc +++ b/tensorflow/core/kernels/constant_op.cc @@ -48,7 +48,7 @@ namespace tensorflow { namespace { NodeDef StripTensorDataFromNodeDef(OpKernelConstruction* ctx) { -#ifndef __ANDROID__ +#ifndef TENSORFLOW_LITE_PROTOS DCHECK_EQ(NodeDef::descriptor()->field_count(), 6) << "The NodeDef format has changed, and the attr-stripping code may need " << "to be updated."; diff --git 
a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc index 753988fc005..f9bf64f2df3 100644 --- a/tensorflow/core/kernels/conv_grad_filter_ops.cc +++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc @@ -975,6 +975,12 @@ void LaunchConv2DBackpropFilterOp<GPUDevice, T>::operator()( device_id, // device_id conv_desc.group_count() // group_count }; +#if TENSORFLOW_USE_ROCM + // cudnn_use_autotune is applicable only to the CUDA flow; + // for ROCm/MIOpen, we need to call GetMIOpenConvolveAlgorithms explicitly + // if we do not have a cached algorithm_config for this conv_parameters. + cudnn_use_autotune = true; +#endif AlgorithmConfig algorithm_config; if (cudnn_use_autotune && !AutoTuneConvBwdFilter::GetInstance()->Find( conv_parameters, &algorithm_config)) { diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc index a22ef6ebaf7..be5d821fc32 100644 --- a/tensorflow/core/kernels/conv_grad_input_ops.cc +++ b/tensorflow/core/kernels/conv_grad_input_ops.cc @@ -1143,6 +1143,12 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()( device_id, // device_id conv_desc.group_count() // group_count }; +#if TENSORFLOW_USE_ROCM + // cudnn_use_autotune is applicable only to the CUDA flow; + // for ROCm/MIOpen, we need to call GetMIOpenConvolveAlgorithms explicitly + // if we do not have a cached algorithm_config for this conv_parameters. + cudnn_use_autotune = true; +#endif AlgorithmConfig algorithm_config; if (cudnn_use_autotune && !AutoTuneConvBwdData::GetInstance()->Find( conv_parameters, &algorithm_config)) { diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc index d963c42f7f5..bc6c64963ad 100644 --- a/tensorflow/core/kernels/conv_grad_ops_3d.cc +++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc @@ -1368,6 +1368,12 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel { using se::dnn::AlgorithmConfig; using se::dnn::AlgorithmDesc; using se::dnn::ProfileResult; +#if TENSORFLOW_USE_ROCM + // cudnn_use_autotune is applicable only to the CUDA flow; + // for ROCm/MIOpen, we need to call GetMIOpenConvolveAlgorithms explicitly + // if we do not have a cached algorithm_config for this conv_parameters. + cudnn_use_autotune_ = true; +#endif AlgorithmConfig algorithm_config; if (cudnn_use_autotune_ && !AutoTuneConv3dBwdData::GetInstance()->Find( conv_parameters, &algorithm_config)) { @@ -1857,6 +1863,12 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel { using se::dnn::AlgorithmConfig; using se::dnn::AlgorithmDesc; using se::dnn::ProfileResult; +#if TENSORFLOW_USE_ROCM + // cudnn_use_autotune is applicable only to the CUDA flow; + // for ROCm/MIOpen, we need to call GetMIOpenConvolveAlgorithms explicitly + // if we do not have a cached algorithm_config for this conv_parameters. + cudnn_use_autotune_ = true; +#endif AlgorithmConfig algorithm_config; if (cudnn_use_autotune_ && !AutoTuneConv3dBwdFilter::GetInstance()->Find( conv_parameters, &algorithm_config)) { diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc index 7bdeae19ac4..d265e9d8f8b 100644 --- a/tensorflow/core/kernels/conv_ops.cc +++ b/tensorflow/core/kernels/conv_ops.cc @@ -983,6 +983,12 @@ void LaunchConv2DOp<GPUDevice, T>::operator()( device_id, // device_id conv_desc.group_count()}; AlgorithmConfig algorithm_config; +#if TENSORFLOW_USE_ROCM + // cudnn_use_autotune is applicable only to the CUDA flow; + // for ROCm/MIOpen, we need to call GetMIOpenConvolveAlgorithms explicitly + // if we do not have a cached algorithm_config for this conv_parameters. + cudnn_use_autotune = true; +#endif if (cudnn_use_autotune && !AutoTuneConv::GetInstance()->Find(conv_parameters, &algorithm_config)) { #if GOOGLE_CUDA diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc index 5eb551fcf48..e9e11aebf61 100644 --- a/tensorflow/core/kernels/conv_ops_3d.cc +++ b/tensorflow/core/kernels/conv_ops_3d.cc @@ -443,6 +443,12 @@ struct LaunchConvOp<GPUDevice, T> { using se::dnn::AlgorithmDesc; using se::dnn::ProfileResult; +#if TENSORFLOW_USE_ROCM + // cudnn_use_autotune is applicable only to the CUDA flow; + // for ROCm/MIOpen, we need to call GetMIOpenConvolveAlgorithms explicitly + // if we do not have a cached algorithm_config for this conv_parameters. + cudnn_use_autotune = true; +#endif AlgorithmConfig algorithm_config; if (cudnn_use_autotune && !AutoTuneConv3d::GetInstance()->Find( diff --git a/tensorflow/core/kernels/data/experimental/BUILD b/tensorflow/core/kernels/data/experimental/BUILD index d041ab5ac6a..a68b3faeb37 100644 --- a/tensorflow/core/kernels/data/experimental/BUILD +++ b/tensorflow/core/kernels/data/experimental/BUILD @@ -434,6 +434,7 @@ tf_kernel_library( name = "snapshot_dataset_op", srcs = ["snapshot_dataset_op.cc"], deps = [ + "//tensorflow/core:core_cpu_internal", "//tensorflow/core:dataset_ops_op_lib", "//tensorflow/core:framework", "//tensorflow/core:lib", @@ -441,6 +442,7 @@ tf_kernel_library( "//tensorflow/core:protos_all_cc", "//tensorflow/core/grappler:graph_view", "//tensorflow/core/kernels/data:dataset_utils", + "//tensorflow/core/platform:platform_port", "//tensorflow/core/profiler/lib:traceme", "@com_google_absl//absl/time", ], diff --git a/tensorflow/core/kernels/data/experimental/snapshot_dataset_op.cc b/tensorflow/core/kernels/data/experimental/snapshot_dataset_op.cc index ae3015bc833..68ee3c4c134 100644 --- a/tensorflow/core/kernels/data/experimental/snapshot_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/snapshot_dataset_op.cc @@ -15,6 +15,7 @@ limitations under the License. #include <random> #include "absl/time/clock.h" +#include "tensorflow/core/common_runtime/dma_helper.h" #include "tensorflow/core/framework/dataset.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/partial_tensor_shape.h" @@ -32,7 +33,9 @@ limitations under the License. #include "tensorflow/core/lib/io/compression.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/io/random_inputstream.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/file_system.h" +#include "tensorflow/core/platform/snappy.h" #if !defined(IS_SLIM_BUILD) #include "tensorflow/core/lib/io/snappy/snappy_inputbuffer.h" #include "tensorflow/core/lib/io/snappy/snappy_outputbuffer.h" @@ -63,9 +66,6 @@ enum SnapshotMode { READER = 0, WRITER = 1, PASSTHROUGH = 2 }; // Defaults to 10 GiB per shard. const int64 kDefaultShardSizeBytes = 10LL * 1024 * 1024 * 1024; -const int64 kSnappyWriterInputBufferSizeBytes = 16 << 20; // 16 MiB -const int64 kSnappyWriterOutputBufferSizeBytes = 16 << 20; // 16 MiB - // The reader input buffer size is deliberately large because the input reader // will throw an error if the compressed block length cannot fit in the input // buffer.
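The snapshot records that the hunks below read and write are length-prefixed: each record is an 8-byte fixed64 length header (kHeaderSize) followed by the payload, encoded with core::EncodeFixed64 and decoded with core::DecodeFixed64. A minimal standalone sketch of that framing, with simplified little-endian stand-ins for TensorFlow's fixed64 helpers:

#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>

// Simplified stand-ins for tensorflow::core::EncodeFixed64/DecodeFixed64;
// both assume a little-endian host.
void EncodeFixed64(char* dst, uint64_t value) {
  std::memcpy(dst, &value, sizeof(value));
}
uint64_t DecodeFixed64(const char* src) {
  uint64_t value;
  std::memcpy(&value, src, sizeof(value));
  return value;
}

constexpr size_t kHeaderSize = sizeof(uint64_t);

// WriteRecord: an 8-byte length header followed by the payload.
std::string WriteRecord(const std::string& data) {
  char header[kHeaderSize];
  EncodeFixed64(header, data.size());
  return std::string(header, kHeaderSize) + data;
}

// ReadRecord: decode the header at *offset, then slice out the payload.
std::string ReadRecord(const std::string& stream, size_t* offset) {
  uint64_t length = DecodeFixed64(stream.data() + *offset);
  *offset += kHeaderSize;
  std::string record = stream.substr(*offset, length);
  *offset += length;
  return record;
}

int main() {
  // A version-1 snapshot element is two such records: the serialized
  // tensor metadata, then the snappy-compressed tensor data.
  std::string stream = WriteRecord("metadata") + WriteRecord("tensor-data");
  size_t offset = 0;
  std::cout << ReadRecord(stream, &offset) << "\n";  // prints "metadata"
  std::cout << ReadRecord(stream, &offset) << "\n";  // prints "tensor-data"
}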
@@ -75,6 +75,8 @@ const int64 kSnappyReaderOutputBufferSizeBytes = 32 << 20; // 32 MiB const size_t kHeaderSize = sizeof(uint64); +const int64 kCurrentVersion = 1; + constexpr char kModeAuto[] = "auto"; constexpr char kModeWrite[] = "write"; constexpr char kModeRead[] = "read"; @@ -95,6 +97,7 @@ constexpr char kState[] = "state"; constexpr char kHashDir[] = "hash_dir"; constexpr char kRunId[] = "run_id"; constexpr char kRunDir[] = "run_dir"; +constexpr char kVersionStr[] = "version"; constexpr char kFilenames[] = "filenames"; constexpr char kCurrentFilenames[] = "current_filenames"; constexpr char kElementsProduced[] = "elements_produced"; @@ -115,9 +118,9 @@ class SnapshotWriter { static constexpr const char* const kWriteStringPiece = "WriteStringPiece"; static constexpr const char* const kWriteCord = "WriteCord"; - explicit SnapshotWriter(WritableFile* dest, const string& compression_type = - io::compression::kNone) - : dest_(dest), compression_type_(compression_type) { + explicit SnapshotWriter(WritableFile* dest, const string& compression_type, + int version, const DataTypeVector& dtypes) + : dest_(dest), compression_type_(compression_type), version_(version) { #if defined(IS_SLIM_BUILD) if (compression_type != io::compression::kNone) { LOG(ERROR) << "Compression is unsupported on mobile platforms. Turning " @@ -134,41 +137,100 @@ class SnapshotWriter { TF_CHECK_OK(zlib_output_buffer->Init()); dest_ = zlib_output_buffer; dest_is_owned_ = true; - } else if (compression_type == io::compression::kSnappy) { - io::SnappyOutputBuffer* snappy_output_buffer = new io::SnappyOutputBuffer( - dest, /*input_buffer_bytes=*/kSnappyWriterInputBufferSizeBytes, - /*output_buffer_bytes=*/kSnappyWriterOutputBufferSizeBytes); - dest_ = snappy_output_buffer; - dest_is_owned_ = true; } #endif // IS_SLIM_BUILD + simple_tensor_mask_.reserve(dtypes.size()); + for (const auto& dtype : dtypes) { + if (DataTypeCanUseMemcpy(dtype)) { + simple_tensor_mask_.push_back(true); + num_simple_++; + } else { + simple_tensor_mask_.push_back(false); + num_complex_++; + } + } } - Status WriteRecord(const StringPiece& data) { - profiler::TraceMe activity( - [&]() { - return absl::StrCat(kClassName, kSeparator, kWriteStringPiece); - }, - profiler::TraceMeLevel::kInfo); - char header[kHeaderSize]; - core::EncodeFixed64(header, data.size()); - TF_RETURN_IF_ERROR(dest_->Append(StringPiece(header, sizeof(header)))); - return dest_->Append(data); - } - + Status WriteTensors(const std::vector& tensors) { + if (compression_type_ != io::compression::kSnappy) { + experimental::SnapshotRecord record; + for (const auto& tensor : tensors) { + TensorProto* t = record.add_tensor(); + tensor.AsProtoTensorContent(t); + } #if defined(PLATFORM_GOOGLE) - Status WriteRecord(const absl::Cord& data) { - profiler::TraceMe activity( - [&]() { return absl::StrCat(kClassName, kSeparator, kWriteCord); }, - profiler::TraceMeLevel::kInfo); - char header[kHeaderSize]; - core::EncodeFixed64(header, data.size()); - - TF_RETURN_IF_ERROR(dest_->Append(StringPiece(header, sizeof(header)))); - - return dest_->Append(data); - } + return WriteRecord(record.SerializeAsCord()); +#else // PLATFORM_GOOGLE + return WriteRecord(record.SerializeAsString()); #endif // PLATFORM_GOOGLE + } + + if (version_ != 1) { + return errors::InvalidArgument("Version: ", version_, + " is not supported."); + } + if (compression_type_ != io::compression::kSnappy) { + return errors::InvalidArgument( + "Version 1 is only compatible with snappy compression"); + } + + std::vector 
tensor_buffers; + tensor_buffers.reserve(num_simple_); + std::vector tensor_protos; + tensor_protos.reserve(num_complex_); + SnapshotTensorMetadata metadata; + int64 total_size = 0; + for (int i = 0; i < tensors.size(); ++i) { + const Tensor& tensor = tensors[i]; + TensorMetadata* tensor_metadata = metadata.add_tensor_metadata(); + tensor.shape().AsProto(tensor_metadata->mutable_tensor_shape()); + int64 size = 0; + if (simple_tensor_mask_[i]) { + auto tensor_buffer = DMAHelper::buffer(&tensor); + tensor_buffers.push_back(tensor_buffer); + size = tensor_buffer->size(); + } else { + TensorProto proto; + tensor.AsProtoTensorContent(&proto); + size = proto.ByteSizeLong(); + tensor_protos.push_back(std::move(proto)); + } + tensor_metadata->set_tensor_size_bytes(size); + total_size += size; + } + + std::vector uncompressed(total_size); + char* position = uncompressed.data(); + int buffer_index = 0; + int proto_index = 0; + for (int i = 0; i < tensors.size(); ++i) { + const auto& tensor_metadata = metadata.tensor_metadata(i); + if (simple_tensor_mask_[i]) { + memcpy(position, tensor_buffers[buffer_index]->data(), + tensor_metadata.tensor_size_bytes()); + buffer_index++; + } else { + tensor_protos[proto_index].SerializeToArray( + position, tensor_metadata.tensor_size_bytes()); + proto_index++; + } + position += tensor_metadata.tensor_size_bytes(); + } + DCHECK_EQ(position, uncompressed.data() + total_size); + + string output; + if (!port::Snappy_Compress(uncompressed.data(), total_size, &output)) { + return errors::Internal("Failed to compress using snappy."); + } +#if defined(PLATFORM_GOOGLE) + absl::Cord metadata_serialized = metadata.SerializeAsCord(); +#else // PLATFORM_GOOGLE + std::string metadata_serialized = metadata.SerializeAsString(); +#endif // PLATFORM_GOOGLE + TF_RETURN_IF_ERROR(WriteRecord(metadata_serialized)); + TF_RETURN_IF_ERROR(WriteRecord(output)); + return Status::OK(); + } Status Sync() { return dest_->Sync(); } @@ -192,9 +254,29 @@ class SnapshotWriter { } private: + Status WriteRecord(const StringPiece& data) { + char header[kHeaderSize]; + core::EncodeFixed64(header, data.size()); + TF_RETURN_IF_ERROR(dest_->Append(StringPiece(header, sizeof(header)))); + return dest_->Append(data); + } + +#if defined(PLATFORM_GOOGLE) + Status WriteRecord(const absl::Cord& data) { + char header[kHeaderSize]; + core::EncodeFixed64(header, data.size()); + TF_RETURN_IF_ERROR(dest_->Append(StringPiece(header, sizeof(header)))); + return dest_->Append(data); + } +#endif // PLATFORM_GOOGLE + WritableFile* dest_; bool dest_is_owned_ = false; const string compression_type_; + const int version_; + std::vector simple_tensor_mask_; // true for simple, false for complex. + int num_simple_ = 0; + int num_complex_ = 0; }; class SnapshotReader { @@ -203,12 +285,14 @@ class SnapshotReader { static constexpr const char* const kReadString = "ReadString"; static constexpr const char* const kReadCord = "ReadCord"; - explicit SnapshotReader( - RandomAccessFile* file, - const string& compression_type = io::compression::kNone) + explicit SnapshotReader(RandomAccessFile* file, + const string& compression_type, int version, + const DataTypeVector& dtypes) : file_(file), input_stream_(new io::RandomAccessInputStream(file)), - compression_type_(compression_type) { + compression_type_(compression_type), + version_(version), + dtypes_(dtypes) { #if defined(IS_SLIM_BUILD) if (compression_type_ != io::compression::kNone) { LOG(ERROR) << "Compression is unsupported on mobile platforms. 
Turning " @@ -223,17 +307,167 @@ class SnapshotReader { input_stream_.release(), zlib_options.input_buffer_size, zlib_options.output_buffer_size, zlib_options, true); } else if (compression_type_ == io::compression::kSnappy) { - input_stream_ = absl::make_unique( - file_, /*input_buffer_bytes=*/kSnappyReaderInputBufferSizeBytes, - /*output_buffer_bytes=*/kSnappyReaderOutputBufferSizeBytes); + if (version_ == 0) { + input_stream_ = absl::make_unique( + file_, /*input_buffer_bytes=*/kSnappyReaderInputBufferSizeBytes, + /*output_buffer_bytes=*/kSnappyReaderOutputBufferSizeBytes); + } else { + input_stream_ = + absl::make_unique(file_, 64 << 20); + } } #endif // IS_SLIM_BUILD + simple_tensor_mask_.reserve(dtypes.size()); + for (const auto& dtype : dtypes) { + if (DataTypeCanUseMemcpy(dtype)) { + simple_tensor_mask_.push_back(true); + num_simple_++; + } else { + simple_tensor_mask_.push_back(false); + num_complex_++; + } + } + } + + Status ReadTensors(std::vector* read_tensors) { + profiler::TraceMe activity( + [&]() { return absl::StrCat(kClassName, kSeparator, "ReadTensors"); }, + profiler::TraceMeLevel::kInfo); + if (version_ == 0 || compression_type_ != io::compression::kSnappy) { + return ReadTensorsV0(read_tensors); + } + if (version_ != 1) { + return errors::InvalidArgument("Version: ", version_, + " is not supported."); + } + if (compression_type_ != io::compression::kSnappy) { + return errors::InvalidArgument("Version 1 only supports snappy."); + } + + SnapshotTensorMetadata metadata; + tstring metadata_str; + TF_RETURN_IF_ERROR(ReadRecord(&metadata_str)); + if (!metadata.ParseFromArray(metadata_str.data(), metadata_str.size())) { + return errors::DataLoss("Could not parse SnapshotTensorMetadata"); + } + read_tensors->reserve(metadata.tensor_metadata_size()); + + std::vector simple_tensors; + simple_tensors.reserve(num_simple_); + std::vector, size_t>> tensor_proto_strs; + tensor_proto_strs.reserve(num_complex_); + TF_RETURN_IF_ERROR( + SnappyUncompress(metadata, &simple_tensors, &tensor_proto_strs)); + + int simple_index = 0; + int complex_index = 0; + for (int i = 0; i < simple_tensor_mask_.size(); ++i) { + if (simple_tensor_mask_[i]) { + read_tensors->push_back(std::move(simple_tensors[simple_index])); + simple_index++; + } else { + auto tensor_proto_str = + std::move(tensor_proto_strs[complex_index].first); + size_t tensor_proto_size = tensor_proto_strs[complex_index].second; + TensorProto tp; +#if defined(PLATFORM_GOOGLE) + auto tensor_proto_ptr = tensor_proto_str.release(); + absl::Cord c; + c.AppendExternalMemory( + absl::string_view(tensor_proto_ptr, tensor_proto_size), + tensor_proto_ptr, + [](void* arg) { delete[] static_cast(arg); }); + if (!tp.ParseFromCord(c)) { + return errors::Internal("Could not parse TensorProto"); + } +#else // PLATFORM_GOOGLE + if (!tp.ParseFromArray(tensor_proto_str.get(), tensor_proto_size)) { + return errors::Internal("Could not parse TensorProto"); + } +#endif // PLATFORM_GOOGLE + Tensor t; + if (!t.FromProto(tp)) { + return errors::Internal("Could not parse Tensor"); + } + read_tensors->push_back(std::move(t)); + complex_index++; + } + } + return Status::OK(); + } + + private: + Status ReadTensorsV0(std::vector* read_tensors) { + experimental::SnapshotRecord record; +#if defined(PLATFORM_GOOGLE) + absl::Cord c; + TF_RETURN_IF_ERROR(ReadRecord(&c)); + record.ParseFromCord(c); +#else // PLATFORM_GOOGLE + tstring record_bytes; + TF_RETURN_IF_ERROR(ReadRecord(&record_bytes)); + record.ParseFromArray(record_bytes.data(), record_bytes.size()); +#endif 
// PLATFORM_GOOGLE + read_tensors->reserve(record.tensor_size()); + for (int i = 0; i < record.tensor_size(); ++i) { + read_tensors->emplace_back(); + if (!read_tensors->back().FromProto(record.tensor(i))) { + return errors::DataLoss("Unable to parse tensor from proto."); + } + } + return Status::OK(); + } + + Status SnappyUncompress( + const SnapshotTensorMetadata& metadata, + std::vector* simple_tensors, + std::vector, size_t>>* + tensor_proto_strs) { + tstring compressed; + TF_RETURN_IF_ERROR(ReadRecord(&compressed)); + size_t size; + if (!port::Snappy_GetUncompressedLength(compressed.data(), + compressed.size(), &size)) { + return errors::Internal("Could not get snappy uncompressed length"); + } + + int num_tensors = metadata.tensor_metadata_size(); + std::vector iov(num_tensors); + int index = 0; + int64 total_size = 0; + for (int i = 0; i < simple_tensor_mask_.size(); ++i) { + const auto& tensor_metadata = metadata.tensor_metadata(i); + if (simple_tensor_mask_[i]) { + TensorShape shape(tensor_metadata.tensor_shape()); + Tensor simple_tensor(dtypes_[i], shape); + TensorBuffer* buffer = DMAHelper::buffer(&simple_tensor); + iov[index].iov_base = buffer->data(); + iov[index].iov_len = buffer->size(); + simple_tensors->push_back(std::move(simple_tensor)); + } else { + auto tensor_proto_str = + absl::make_unique(tensor_metadata.tensor_size_bytes()); + iov[index].iov_base = tensor_proto_str.get(); + iov[index].iov_len = tensor_metadata.tensor_size_bytes(); + tensor_proto_strs->push_back(std::make_pair( + std::move(tensor_proto_str), tensor_metadata.tensor_size_bytes())); + } + total_size += iov[index].iov_len; + index++; + } + if (size != total_size) { + return errors::Internal("Uncompressed size mismatch. Snappy expects ", + size, " whereas the tensor metadata suggests ", + total_size); + } + if (!port::Snappy_UncompressToIOVec(compressed.data(), compressed.size(), + iov.data(), num_tensors)) { + return errors::Internal("Failed to perform snappy decompression."); + } + return Status::OK(); } Status ReadRecord(tstring* record) { - profiler::TraceMe activity( - [&]() { return absl::StrCat(kClassName, kSeparator, kReadString); }, - profiler::TraceMeLevel::kInfo); tstring header; TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(kHeaderSize, &header)); uint64 length = core::DecodeFixed64(header.data()); @@ -245,13 +479,6 @@ class SnapshotReader { tstring header; TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(kHeaderSize, &header)); uint64 length = core::DecodeFixed64(header.data()); - profiler::TraceMe activity( - [&]() { - return absl::StrCat(kClassName, kSeparator, kReadCord, - "#length=", length, "#"); - }, - profiler::TraceMeLevel::kInfo); - if (compression_type_ == io::compression::kNone) { return input_stream_->ReadNBytes(length, record); } else { @@ -268,50 +495,31 @@ class SnapshotReader { } #endif - private: RandomAccessFile* file_; std::unique_ptr input_stream_; const string compression_type_; + const int version_; + const DataTypeVector dtypes_; + int num_simple_ = 0; + int num_complex_ = 0; + std::vector simple_tensor_mask_; // true for simple, false for complex. 
}; Status WriteMetadataFile(const string& hash_dir, const experimental::SnapshotMetadataRecord& metadata) { string metadata_filename = io::JoinPath(hash_dir, kSnapshotFilename); TF_RETURN_IF_ERROR(Env::Default()->RecursivelyCreateDir(hash_dir)); - std::string tmp_filename = absl::StrCat(metadata_filename, "-tmp-", random::New64()); - - std::unique_ptr file; - TF_RETURN_IF_ERROR(Env::Default()->NewWritableFile(tmp_filename, &file)); - - auto writer = absl::make_unique(file.get()); - TF_RETURN_IF_ERROR(writer->WriteRecord(metadata.SerializeAsString())); - TF_RETURN_IF_ERROR(writer->Close()); - TF_RETURN_IF_ERROR(file->Sync()); - TF_RETURN_IF_ERROR(file->Close()); - - TF_RETURN_IF_ERROR( - Env::Default()->RenameFile(tmp_filename, metadata_filename)); - - return Status::OK(); + TF_RETURN_IF_ERROR(WriteBinaryProto(Env::Default(), tmp_filename, metadata)); + return Env::Default()->RenameFile(tmp_filename, metadata_filename); } Status ReadMetadataFile(const string& hash_dir, experimental::SnapshotMetadataRecord* metadata) { string metadata_filename = io::JoinPath(hash_dir, kSnapshotFilename); TF_RETURN_IF_ERROR(Env::Default()->FileExists(metadata_filename)); - - std::unique_ptr file; - TF_RETURN_IF_ERROR( - Env::Default()->NewRandomAccessFile(metadata_filename, &file)); - - tstring record_bytes; - SnapshotReader reader(file.get()); - TF_RETURN_IF_ERROR(reader.ReadRecord(&record_bytes)); - - metadata->ParseFromArray(record_bytes.data(), record_bytes.size()); - return Status::OK(); + return ReadBinaryProto(Env::Default(), metadata_filename, metadata); } Status DumpDatasetGraph(const std::string& path, uint64 hash, @@ -332,6 +540,10 @@ Status DetermineOpState(const std::string& mode_string, const uint64 pending_snapshot_expiry_seconds, SnapshotMode* mode) { if (mode_string == kModeRead) { + // In read mode, we should expect a metadata file is written. + if (errors::IsNotFound(file_status)) { + return file_status; + } LOG(INFO) << "Overriding mode to reader."; *mode = READER; return Status::OK(); @@ -727,10 +939,25 @@ class SnapshotDatasetOp : public UnaryDatasetOpKernel { if (run_id.empty()) { run_id = metadata.run_id(); } + // dtypes in metadata should be the same as dataset()->output_dtypes + if (metadata.dtype_size() != dataset()->output_dtypes().size()) { + return errors::Internal( + "Expected number of dtypes: ", + dataset()->output_dtypes().size(), + " but number in snapshot: ", metadata.dtype_size()); + } + for (int i = 0; i < metadata.dtype_size(); ++i) { + if (metadata.dtype(i) != dataset()->output_dtypes()[i]) { + return errors::Internal( + "Type: ", i, + " doesn't match. 
Snapshot: ", metadata.dtype(i), + "; dataset: ", dataset()->output_dtypes()[i]); + } + } iterator_ = absl::make_unique( SnapshotReaderIterator::Params{ dataset(), absl::StrCat(prefix(), "ReaderImpl")}, - hash_dir_, run_id); + hash_dir_, run_id, metadata.version()); break; case PASSTHROUGH: iterator_ = absl::make_unique( @@ -748,10 +975,11 @@ class SnapshotDatasetOp : public UnaryDatasetOpKernel { explicit SnapshotReaderIterator(const Params& params, const string& hash_dir, - const string& run_id) + const string& run_id, int64 version) : DatasetIterator(params), hash_dir_(hash_dir), - run_id_(run_id) {} + run_id_(run_id), + version_(version) {} ~SnapshotReaderIterator() override { mutex_lock l(mu_); @@ -889,6 +1117,8 @@ class SnapshotDatasetOp : public UnaryDatasetOpKernel { writer->WriteScalar(full_name(kHashDir), hash_dir_)); TF_RETURN_IF_ERROR(writer->WriteScalar(full_name(kRunId), run_id_)); TF_RETURN_IF_ERROR(writer->WriteScalar(full_name(kRunDir), run_dir_)); + TF_RETURN_IF_ERROR( + writer->WriteScalar(full_name(kVersionStr), version_)); TF_RETURN_IF_ERROR(writer->WriteScalar( full_name(strings::StrCat(kFilenames, kSizeSuffix)), filenames_.size())); @@ -932,6 +1162,8 @@ class SnapshotDatasetOp : public UnaryDatasetOpKernel { } TF_RETURN_IF_ERROR(reader->ReadScalar(full_name(kRunId), &run_id_)); TF_RETURN_IF_ERROR(reader->ReadScalar(full_name(kRunDir), &run_dir_)); + TF_RETURN_IF_ERROR( + reader->ReadScalar(full_name(kVersionStr), &version_)); curr_filenames_.clear(); curr_filenames_.reserve(dataset()->num_reader_threads_); for (auto i = 0; i < dataset()->num_reader_threads_; ++i) { @@ -986,7 +1218,8 @@ class SnapshotDatasetOp : public UnaryDatasetOpKernel { std::unique_ptr file; TF_RETURN_IF_ERROR( Env::Default()->NewRandomAccessFile(filename, &file)); - SnapshotReader reader(file.get(), dataset()->compression_); + SnapshotReader reader(file.get(), dataset()->compression_, version_, + dataset()->output_dtypes()); while (true) { // Wait for a slot in the buffer. 
@@ -1003,30 +1236,14 @@ class SnapshotDatasetOp : public UnaryDatasetOpKernel { "ReadFile"); } } -#if !defined(PLATFORM_GOOGLE) - tstring record_bytes; - Status s = reader.ReadRecord(&record_bytes); -#else - absl::Cord record_cord; - Status s = reader.ReadRecord(&record_cord); -#endif + std::vector read_tensors; + Status s = reader.ReadTensors(&read_tensors); if (s.ok()) { profiler::TraceMe activity( [&]() { return absl::StrCat(prefix(), kSeparator, kParse); }, profiler::TraceMeLevel::kInfo); - experimental::SnapshotRecord record; -#if !defined(PLATFORM_GOOGLE) - record.ParseFromArray(record_bytes.data(), record_bytes.size()); -#else - record.ParseFromCord(record_cord); -#endif BufferElement elem; - for (int i = 0; i < record.tensor_size(); ++i) { - elem.value.emplace_back(); - if (!elem.value.back().FromProto(record.tensor(i))) { - return errors::DataLoss("Unable to parse tensor from proto."); - } - } + elem.value = std::move(read_tensors); elem.status = Status::OK(); mutex_lock l(mu_); buffer_.push_back(std::move(elem)); @@ -1142,9 +1359,9 @@ class SnapshotDatasetOp : public UnaryDatasetOpKernel { condition_variable cond_var_; const string hash_dir_; - const experimental::SnapshotMetadataRecord metadata_; tstring run_id_ GUARDED_BY(mu_); tstring run_dir_ GUARDED_BY(mu_); + int64 version_; std::vector filenames_; uint64 elements_produced_ GUARDED_BY(mu_) = 0; @@ -1220,6 +1437,10 @@ class SnapshotDatasetOp : public UnaryDatasetOpKernel { metadata.set_creation_timestamp(EnvTime::NowMicros()); metadata.set_graph_hash(dataset()->graph_hash_); metadata.set_run_id(run_id_.data(), run_id_.size()); + metadata.set_version(kCurrentVersion); + for (const auto& output_dtype : dataset()->output_dtypes()) { + metadata.add_dtype(output_dtype); + } metadata.set_finalized(false); TF_RETURN_IF_ERROR(WriteMetadataFile(hash_dir_, metadata)); } @@ -1564,11 +1785,8 @@ class SnapshotDatasetOp : public UnaryDatasetOpKernel { } if (produced_elem) { - experimental::SnapshotRecord record; for (const auto& out_tensor : elem.value) { *bytes_written += out_tensor.TotalBytes(); - TensorProto* t = record.add_tensor(); - out_tensor.AsProtoTensorContent(t); } bool should_close; @@ -1584,16 +1802,11 @@ class SnapshotDatasetOp : public UnaryDatasetOpKernel { TF_RETURN_IF_ERROR(Env::Default()->NewAppendableFile( *snapshot_data_filename, file)); *writer = absl::make_unique( - file->get(), dataset()->compression_); + file->get(), dataset()->compression_, kCurrentVersion, + dataset()->output_dtypes()); *bytes_written = 0; } -#if defined(PLATFORM_GOOGLE) - TF_RETURN_IF_ERROR( - (*writer)->WriteRecord(record.SerializeAsCord())); -#else // PLATFORM_GOOGLE - TF_RETURN_IF_ERROR( - (*writer)->WriteRecord(record.SerializeAsString())); -#endif // PLATFORM_GOOGLE + TF_RETURN_IF_ERROR((*writer)->WriteTensors(elem.value)); return Status::OK(); } @@ -1641,7 +1854,8 @@ class SnapshotDatasetOp : public UnaryDatasetOpKernel { return; } std::unique_ptr writer( - new SnapshotWriter(file.get(), dataset()->compression_)); + new SnapshotWriter(file.get(), dataset()->compression_, + kCurrentVersion, dataset()->output_dtypes())); bool end_of_processing = false; while (!end_of_processing) { diff --git a/tensorflow/core/kernels/data/optimize_dataset_op.cc b/tensorflow/core/kernels/data/optimize_dataset_op.cc index ce1b5ffb11e..c249d82ee9b 100644 --- a/tensorflow/core/kernels/data/optimize_dataset_op.cc +++ b/tensorflow/core/kernels/data/optimize_dataset_op.cc @@ -55,9 +55,15 @@ void OptimizeDatasetOp::MakeDataset(OpKernelContext* ctx, DatasetBase* 
input, auto config_factory = [this, &optimizations]() { return CreateConfig(optimizations, optimization_configs_); }; - OP_REQUIRES_OK(ctx, RewriteDataset(ctx, input, std::move(config_factory), - /*optimize_function_library=*/true, - /*record_fingerprint=*/true, output)); + Status s = RewriteDataset(ctx, input, std::move(config_factory), + /*optimize_function_library=*/true, + /*record_fingerprint=*/true, output); + if (errors::IsDeadlineExceeded(s)) { + // Ignore DeadlineExceeded as it implies that the attempted rewrite took too + // long which should not prevent further computation. + return; + } + OP_REQUIRES_OK(ctx, s); } RewriterConfig OptimizeDatasetOp::CreateConfig( diff --git a/tensorflow/core/kernels/data/tf_record_dataset_op.cc b/tensorflow/core/kernels/data/tf_record_dataset_op.cc index b2a78794d36..8b6658167ea 100644 --- a/tensorflow/core/kernels/data/tf_record_dataset_op.cc +++ b/tensorflow/core/kernels/data/tf_record_dataset_op.cc @@ -38,6 +38,15 @@ namespace data { constexpr char kCurrentFileIndex[] = "current_file_index"; constexpr char kOffset[] = "offset"; +constexpr char kGcsFsPrefix[] = "gs://"; +constexpr int64 kCloudTpuBlockSize = 127LL << 20; // 127MB. + +bool is_cloud_tpu_gcs_fs() { +#if defined(PLATFORM_CLOUD_TPU) && defined(TPU_GCS_FS) + return true; +#endif + return false; +} class TFRecordDatasetOp::Dataset : public DatasetBase { public: @@ -224,11 +233,13 @@ void TFRecordDatasetOp::MakeDataset(OpKernelContext* ctx, ctx, filenames_tensor->dims() <= 1, errors::InvalidArgument("`filenames` must be a scalar or a vector.")); + bool is_gcs_fs = true; std::vector filenames; filenames.reserve(filenames_tensor->NumElements()); for (int i = 0; i < filenames_tensor->NumElements(); ++i) { VLOG(2) << "Reading file: " << filenames_tensor->flat()(i); filenames.push_back(filenames_tensor->flat()(i)); + is_gcs_fs &= absl::StartsWith(filenames[i], kGcsFsPrefix); } tstring compression_type; @@ -242,6 +253,14 @@ void TFRecordDatasetOp::MakeDataset(OpKernelContext* ctx, errors::InvalidArgument( "`buffer_size` must be >= 0 (0 == no buffering)")); + if (is_gcs_fs && is_cloud_tpu_gcs_fs() && buffer_size < kCloudTpuBlockSize) { + LOG(WARNING) << "User buffer size is too small for reading Cloud TPU " + << "TFRecords stored in GCS. Overriding " << buffer_size + << " to the minimum recommended buffer_size = " + << kCloudTpuBlockSize; + buffer_size = kCloudTpuBlockSize; + } + *output = new Dataset(ctx, std::move(filenames), compression_type, buffer_size); } diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc index d7b00735cfc..9d64a4ba896 100644 --- a/tensorflow/core/kernels/function_ops.cc +++ b/tensorflow/core/kernels/function_ops.cc @@ -315,28 +315,17 @@ void RemoteCallOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) { const string& source_device = lib->device()->name(); const Tensor* target; OP_REQUIRES_OK_ASYNC(ctx, ctx->input("target", &target), done); - string target_device; + + FunctionTarget function_target; OP_REQUIRES_OK_ASYNC( ctx, - DeviceNameUtils::CanonicalizeDeviceName(target->scalar()(), - source_device, &target_device), + DeviceNameUtils::CanonicalizeDeviceName( + target->scalar()(), source_device, &function_target.first), done); + function_target.second = lib; - std::string func_name = func_.name(); - AttrValueMap attr_values = func_.attr(); - - FunctionLibraryRuntime::InstantiateOptions instantiate_opts; - - const auto* config = (ctx->function_library()) - ? 
ctx->function_library()->config_proto() - : nullptr; - if (config) { - instantiate_opts.config_proto = *config; - } - - instantiate_opts.target = target_device; - - FunctionTarget function_target = {target_device, lib}; + const string& target_device = function_target.first; + const string& func_name = func_.name(); FunctionLibraryRuntime::Handle handle; { @@ -352,8 +341,16 @@ void RemoteCallOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) { " on ", target_device); }, profiler::TraceMeLevel::kInfo); + FunctionLibraryRuntime::InstantiateOptions instantiate_opts; + const auto* config = (ctx->function_library()) + ? ctx->function_library()->config_proto() + : nullptr; + if (config) { + instantiate_opts.config_proto = *config; + } + instantiate_opts.target = target_device; OP_REQUIRES_OK_ASYNC(ctx, - lib->Instantiate(func_name, AttrSlice(&attr_values), + lib->Instantiate(func_name, AttrSlice(&func_.attr()), instantiate_opts, &handle), done); auto insert_result = handle_cache_.insert({function_target, handle}); @@ -373,23 +370,17 @@ void RemoteCallOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) { opts.remote_execution = true; } opts.create_rendezvous = true; - std::vector args; - args.reserve(arguments.size()); - for (const Tensor& argument : arguments) { - args.push_back(argument); - } + std::vector args(arguments.begin(), arguments.end()); + opts.args_alloc_attrs.reserve(input_dtypes_.size()); for (const auto& dtype : input_dtypes_) { AllocatorAttributes arg_alloc_attrs; - if (DataTypeAlwaysOnHost(dtype)) { - arg_alloc_attrs.set_on_host(true); - } + arg_alloc_attrs.set_on_host(DataTypeAlwaysOnHost(dtype)); opts.args_alloc_attrs.push_back(arg_alloc_attrs); } + opts.rets_alloc_attrs.reserve(output_dtypes_.size()); for (const auto& dtype : output_dtypes_) { AllocatorAttributes ret_alloc_attrs; - if (DataTypeAlwaysOnHost(dtype)) { - ret_alloc_attrs.set_on_host(true); - } + ret_alloc_attrs.set_on_host(DataTypeAlwaysOnHost(dtype)); opts.rets_alloc_attrs.push_back(ret_alloc_attrs); } auto* rets = new std::vector; @@ -405,12 +396,14 @@ void RemoteCallOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) { profiler::TraceMeLevel::kInfo); lib->Run( opts, handle, args, rets, - [rets, done, func_name, ctx, opts, target_device](const Status& status) { + [rets, done = std::move(done), func_name, ctx, + function_step_id = opts.step_id, + target_device = std::move(function_target.first)](const Status& status) { profiler::TraceMe activity( [&] { return absl::StrCat("RemoteCallOpDone#func_name=", func_name, ",parent_step_id=", ctx->step_id(), - ",function_step_id=", opts.step_id, + ",function_step_id=", function_step_id, ",device=", target_device, "#"); }, profiler::TraceMeLevel::kInfo); diff --git a/tensorflow/core/kernels/mkl_dequantize_op_test.cc b/tensorflow/core/kernels/mkl_dequantize_op_test.cc index 975ffa1b1ee..cfcc53c2f9a 100644 --- a/tensorflow/core/kernels/mkl_dequantize_op_test.cc +++ b/tensorflow/core/kernels/mkl_dequantize_op_test.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#ifdef INTEL_MKL #include "tensorflow/core/framework/fake_input.h" #include "tensorflow/core/framework/node_def_builder.h" @@ -140,3 +141,4 @@ TEST_F(MklDequantizeOpTest, MKLInput) { } } // namespace tensorflow +#endif // INTEL_MKL diff --git a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc index aa743f10504..6427f805874 100644 --- a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc +++ b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc @@ -53,7 +53,10 @@ class MklFusedBatchNormFwdPrimitive : public MklPrimitive { public: explicit MklFusedBatchNormFwdPrimitive(const MklBatchNormFwdParams& fwdParams) : cpu_engine_(engine::cpu, 0) { - context_.fwd_stream.reset(new mkldnn::stream(mkldnn::stream::kind::eager)); +#ifndef ENABLE_MKLDNN_V1 + context_.fwd_stream.reset( + new mkldnn::stream(mkldnn::stream::kind::eager_nostore)); +#endif if (context_.bn_fwd == nullptr) Setup(fwdParams); } @@ -299,7 +302,10 @@ class MklFusedBatchNormBwdPrimitive : public MklPrimitive { public: explicit MklFusedBatchNormBwdPrimitive(const MklBatchNormBwdParams& bwdParams) : cpu_engine_(engine::cpu, 0) { - context_.bwd_stream.reset(new mkldnn::stream(mkldnn::stream::kind::eager)); +#ifndef ENABLE_MKLDNN_V1 + context_.bwd_stream.reset( + new mkldnn::stream(mkldnn::stream::kind::eager_nostore)); +#endif if (context_.bn_bwd == nullptr) Setup(bwdParams); } diff --git a/tensorflow/core/kernels/non_max_suppression_op.cc b/tensorflow/core/kernels/non_max_suppression_op.cc index f0f6a5c04a9..20ae3a2e0d0 100644 --- a/tensorflow/core/kernels/non_max_suppression_op.cc +++ b/tensorflow/core/kernels/non_max_suppression_op.cc @@ -284,14 +284,176 @@ void DoNonMaxSuppressionOp(OpKernelContext* context, const Tensor& scores, } } +struct ResultCandidate { + int box_index; + float score; + int class_idx; + float box_coord[4]; +}; + +void DoNMSPerClass(int batch_idx, int class_idx, const float* boxes_data, + const float* scores_data, int num_boxes, int q, + int num_classes, const int size_per_class, + const float score_threshold, const float iou_threshold, + std::vector& result_candidate_vec) { + std::vector class_scores_data; + class_scores_data.reserve(num_boxes); + std::vector class_boxes_data; + class_boxes_data.reserve(num_boxes * 4); + + for (int box_idx = 0; box_idx < num_boxes; ++box_idx) { + class_scores_data.push_back(scores_data[box_idx * num_classes + class_idx]); + for (int cid = 0; cid < 4; ++cid) { + if (q > 1) { + class_boxes_data.push_back( + boxes_data[(box_idx * q + class_idx) * 4 + cid]); + } else { + class_boxes_data.push_back(boxes_data[box_idx * 4 + cid]); + } + } + } + + // Copy class_boxes_data to a tensor + TensorShape boxesShape({num_boxes, 4}); + Tensor boxes(DT_FLOAT, boxesShape); + std::copy_n(class_boxes_data.begin(), class_boxes_data.size(), + boxes.unaligned_flat().data()); + + // Do NMS, get the candidate indices of form vector + // Data structure for selection candidate in NMS. 
+ struct Candidate { + int box_index; + float score; + }; + auto cmp = [](const Candidate bs_i, const Candidate bs_j) { + return bs_i.score > bs_j.score; + }; + std::vector candidate_vector; + for (int i = 0; i < class_scores_data.size(); ++i) { + if (class_scores_data[i] > score_threshold) { + candidate_vector.emplace_back(Candidate({i, class_scores_data[i]})); + } + } + + std::vector selected; + std::vector selected_boxes; + Candidate next_candidate; + + std::sort(candidate_vector.begin(), candidate_vector.end(), cmp); + const Tensor const_boxes = boxes; + typename TTypes::ConstTensor boxes_data_t = + const_boxes.tensor(); + int candidate_idx = 0; + float iou; + while (selected.size() < size_per_class && + candidate_idx < candidate_vector.size()) { + next_candidate = candidate_vector[candidate_idx++]; + + // Overlapping boxes are likely to have similar scores, + // therefore we iterate through the previously selected boxes backwards + // in order to see if `next_candidate` should be suppressed. + bool should_select = true; + for (int j = selected.size() - 1; j >= 0; --j) { + iou = IOU(boxes_data_t, next_candidate.box_index, selected[j]); + if (iou > iou_threshold) { + should_select = false; + break; + } + } + + if (should_select) { + // Add the selected box to the result candidate. Sorted by score + int id = next_candidate.box_index; + result_candidate_vec[selected.size() + size_per_class * class_idx] = { + next_candidate.box_index, + next_candidate.score, + class_idx, + {boxes_data_t(id, 0), boxes_data_t(id, 1), boxes_data_t(id, 2), + boxes_data_t(id, 3)}}; + selected.push_back(next_candidate.box_index); + } + } +} + +void SelectResultPerBatch(std::vector& nmsed_boxes, + std::vector& nmsed_scores, + std::vector& nmsed_classes, + std::vector& result_candidate_vec, + std::vector& final_valid_detections, + const int batch_idx, int total_size_per_batch, + bool pad_per_class, int max_size_per_batch, + bool clip_boxes, int per_batch_size) { + auto rc_cmp = [](const ResultCandidate rc_i, const ResultCandidate rc_j) { + return rc_i.score > rc_j.score; + }; + std::sort(result_candidate_vec.begin(), result_candidate_vec.end(), rc_cmp); + + int max_detections = 0; + int result_candidate_size = + std::count_if(result_candidate_vec.begin(), result_candidate_vec.end(), + [](ResultCandidate rc) { return rc.box_index > -1; }); + // If pad_per_class is false, we always pad to max_total_size + if (!pad_per_class) { + max_detections = std::min(result_candidate_size, total_size_per_batch); + } else { + max_detections = std::min(per_batch_size, result_candidate_size); + } + + final_valid_detections[batch_idx] = max_detections; + + int curr_total_size = max_detections; + int result_idx = 0; + // Pick the top max_detections values + while (curr_total_size > 0 && result_idx < result_candidate_vec.size()) { + ResultCandidate next_candidate = result_candidate_vec[result_idx++]; + // Add to final output vectors + if (clip_boxes) { + const float box_min = 0.0; + const float box_max = 1.0; + nmsed_boxes.push_back( + std::max(std::min(next_candidate.box_coord[0], box_max), box_min)); + nmsed_boxes.push_back( + std::max(std::min(next_candidate.box_coord[1], box_max), box_min)); + nmsed_boxes.push_back( + std::max(std::min(next_candidate.box_coord[2], box_max), box_min)); + nmsed_boxes.push_back( + std::max(std::min(next_candidate.box_coord[3], box_max), box_min)); + } else { + nmsed_boxes.push_back(next_candidate.box_coord[0]); + nmsed_boxes.push_back(next_candidate.box_coord[1]); + 
+      nmsed_boxes.push_back(next_candidate.box_coord[2]);
+      nmsed_boxes.push_back(next_candidate.box_coord[3]);
+    }
+    nmsed_scores.push_back(next_candidate.score);
+    nmsed_classes.push_back(next_candidate.class_idx);
+    curr_total_size--;
+  }
+
+  nmsed_boxes.resize(per_batch_size * 4, 0);
+  nmsed_scores.resize(per_batch_size, 0);
+  nmsed_classes.resize(per_batch_size, 0);
+}
+
 void BatchedNonMaxSuppressionOp(
     OpKernelContext* context, const Tensor& inp_boxes,
     const Tensor& inp_scores, int num_boxes, const int max_size_per_class,
     const int total_size_per_batch, const float score_threshold,
     const float iou_threshold, bool pad_per_class = false,
     bool clip_boxes = true) {
-  int q = inp_boxes.dim_size(2);
-  int num_classes = inp_scores.dim_size(2);
   const int num_batches = inp_boxes.dim_size(0);
+  int num_classes = inp_scores.dim_size(2);
+  int q = inp_boxes.dim_size(2);
+
+  const float* scores_data =
+      const_cast<float*>(inp_scores.flat<float>().data());
+  const float* boxes_data = const_cast<float*>(inp_boxes.flat<float>().data());
+
+  int boxes_per_batch = num_boxes * q * 4;
+  int scores_per_batch = num_boxes * num_classes;
+  const int size_per_class = std::min(max_size_per_class, num_boxes);
+  std::vector<std::vector<ResultCandidate>> result_candidate_vec(
+      num_batches,
+      std::vector<ResultCandidate>(size_per_class * num_classes,
+                                   {-1, -1.0, -1, {0.0, 0.0, 0.0, 0.0}}));
 
   // [num_batches, per_batch_size * 4]
   std::vector<std::vector<float>> nmsed_boxes(num_batches);
@@ -300,166 +462,71 @@ void BatchedNonMaxSuppressionOp(
   // [num_batches, per_batch_size]
   std::vector<std::vector<float>> nmsed_classes(num_batches);
   // [num_batches]
-  std::vector<int> final_valid_detections;
+  std::vector<int> final_valid_detections(num_batches);
+
+  auto shard_nms = [&](int begin, int end) {
+    for (int idx = begin; idx < end; ++idx) {
+      int batch_idx = idx / num_classes;
+      int class_idx = idx % num_classes;
+      DoNMSPerClass(batch_idx, class_idx,
+                    boxes_data + boxes_per_batch * batch_idx,
+                    scores_data + scores_per_batch * batch_idx, num_boxes, q,
+                    num_classes, size_per_class, score_threshold, iou_threshold,
+                    result_candidate_vec[batch_idx]);
+    }
+  };
+
+  int length = num_batches * num_classes;
+  // Input data boxes_data, scores_data
+  int input_bytes = num_boxes * 10 * sizeof(float);
+  int output_bytes = num_boxes * 10 * sizeof(float);
+  int compute_cycles = Eigen::TensorOpCost::AddCost<int>() * num_boxes * 14 +
+                       Eigen::TensorOpCost::MulCost<int>() * num_boxes * 9 +
+                       Eigen::TensorOpCost::MulCost<float>() * num_boxes * 9 +
+                       Eigen::TensorOpCost::AddCost<float>() * num_boxes * 8;
+  // The cost here is not the actual number of cycles, but rather a set of
+  // hand-tuned numbers that seem to work best.
+  const Eigen::TensorOpCost cost(input_bytes, output_bytes, compute_cycles);
+  const CPUDevice& d = context->eigen_device<CPUDevice>();
+  d.parallelFor(length, cost, shard_nms);
 
   int per_batch_size = total_size_per_batch;
-
-  // perform non_max_suppression operation for each batch independently
-  for (int batch = 0; batch < num_batches; ++batch) {
-    // dims of per_batch_boxes [num_boxes, q, 4]
-    Tensor per_batch_boxes = inp_boxes.Slice(batch, batch + 1);
-    // dims of per_batch_scores [num_boxes, num_classes]
-    Tensor per_batch_scores = inp_scores.Slice(batch, batch + 1);
-
-    struct ResultCandidate {
-      int box_index;
-      float score;
-      int class_idx;
-      float box_coord[4];
-    };
-
-    std::vector<ResultCandidate> result_candidate_vec;
-
-    float* scores_data = per_batch_scores.unaligned_flat<float>().data();
-    float* boxes_data = per_batch_boxes.unaligned_flat<float>().data();
-
-    // Iterate through all classes
-    for (int class_idx = 0; class_idx < num_classes; ++class_idx) {
-      std::vector<float> class_scores_data;
-      class_scores_data.reserve(num_boxes);
-      std::vector<float> class_boxes_data;
-      class_boxes_data.reserve(num_boxes * 4);
-
-      for (int box = 0; box < num_boxes; ++box) {
-        // Get the scores per class
-        // class_scores_data dim is [num_boxes].
-        class_scores_data.push_back(scores_data[box * num_classes + class_idx]);
-        for (int cid = 0; cid < 4; ++cid) {
-          if (q > 1) {
-            // Get the boxes per class. class_boxes_data dims is [num_boxes, 4]
-            class_boxes_data.push_back(
-                boxes_data[(box * q + class_idx) * 4 + cid]);
-          } else {
-            class_boxes_data.push_back(boxes_data[box * 4 + cid]);
-          }
-        }
-      }
-
-      // Copy class_boxes_data to a tensor
-      TensorShape boxesShape({num_boxes, 4});
-      Tensor boxes(per_batch_boxes.dtype(), boxesShape);
-      std::copy_n(class_boxes_data.begin(), class_boxes_data.size(),
-                  boxes.unaligned_flat<float>().data());
-
-      const int size_per_class = std::min(max_size_per_class, num_boxes);
-      // Do NMS, get the candidate indices of form vector<int>
-      // Data structure for selection candidate in NMS.
-      struct Candidate {
-        int box_index;
-        float score;
-      };
-      auto cmp = [](const Candidate bs_i, const Candidate bs_j) {
-        return bs_i.score > bs_j.score;
-      };
-      std::vector<Candidate> candidate_vector;
-      for (int i = 0; i < class_scores_data.size(); ++i) {
-        if (class_scores_data[i] > score_threshold) {
-          candidate_vector.emplace_back(Candidate({i, class_scores_data[i]}));
-        }
-      }
-
-      std::vector<int> selected;
-      Candidate next_candidate;
-
-      std::sort(candidate_vector.begin(), candidate_vector.end(), cmp);
-      const Tensor const_boxes = boxes;
-      typename TTypes<float, 2>::ConstTensor boxes_data =
-          const_boxes.tensor<float, 2>();
-      int candidate_idx = 0;
-      float iou;
-      while (selected.size() < size_per_class &&
-             candidate_idx < candidate_vector.size()) {
-        next_candidate = candidate_vector[candidate_idx++];
-
-        // Overlapping boxes are likely to have similar scores,
-        // therefore we iterate through the previously selected boxes backwards
-        // in order to see if `next_candidate` should be suppressed.
-        bool should_select = true;
-        for (int j = selected.size() - 1; j >= 0; --j) {
-          iou = IOU(boxes_data, next_candidate.box_index, selected[j]);
-          if (iou > iou_threshold) {
-            should_select = false;
-            break;
-          }
-        }
-
-        if (should_select) {
-          selected.push_back(next_candidate.box_index);
-          // Add the selected box to the result candidate. Sorted by score
-          int id = next_candidate.box_index;
-          ResultCandidate rc = {next_candidate.box_index,
-                                next_candidate.score,
-                                class_idx,
-                                {boxes_data(id, 0), boxes_data(id, 1),
-                                 boxes_data(id, 2), boxes_data(id, 3)}};
-          result_candidate_vec.push_back(rc);
-        }
-      }
-    }
-
-    auto rc_cmp = [](const ResultCandidate rc_i, const ResultCandidate rc_j) {
-      return rc_i.score > rc_j.score;
-    };
-    std::sort(result_candidate_vec.begin(), result_candidate_vec.end(), rc_cmp);
-
-    int max_detections = 0;
-    // If pad_per_class is false, we always pad to max_total_size
-    if (!pad_per_class) {
-      max_detections =
-          std::min((int)result_candidate_vec.size(), total_size_per_batch);
-      per_batch_size = total_size_per_batch;
-    } else {
-      per_batch_size =
-          std::min(total_size_per_batch, max_size_per_class * num_classes);
-      max_detections =
-          std::min(per_batch_size, (int)result_candidate_vec.size());
-    }
-
-    final_valid_detections.push_back(max_detections);
-
-    int curr_total_size = max_detections;
-    int result_idx = 0;
-    // Pick the top max_detections values
-    while (curr_total_size > 0 && result_idx < result_candidate_vec.size()) {
-      ResultCandidate next_candidate = result_candidate_vec[result_idx++];
-      // Add to final output vectors
-      if (clip_boxes) {
-        const float box_min = 0.0;
-        const float box_max = 1.0;
-        nmsed_boxes[batch].push_back(
-            std::max(std::min(next_candidate.box_coord[0], box_max), box_min));
-        nmsed_boxes[batch].push_back(
-            std::max(std::min(next_candidate.box_coord[1], box_max), box_min));
-        nmsed_boxes[batch].push_back(
-            std::max(std::min(next_candidate.box_coord[2], box_max), box_min));
-        nmsed_boxes[batch].push_back(
-            std::max(std::min(next_candidate.box_coord[3], box_max), box_min));
-      } else {
-        nmsed_boxes[batch].push_back(next_candidate.box_coord[0]);
-        nmsed_boxes[batch].push_back(next_candidate.box_coord[1]);
-        nmsed_boxes[batch].push_back(next_candidate.box_coord[2]);
-        nmsed_boxes[batch].push_back(next_candidate.box_coord[3]);
-      }
-      nmsed_scores[batch].push_back(next_candidate.score);
-      nmsed_classes[batch].push_back(next_candidate.class_idx);
-      curr_total_size--;
-    }
-
-    nmsed_boxes[batch].resize(per_batch_size * 4, 0);
-    nmsed_scores[batch].resize(per_batch_size, 0);
-    nmsed_classes[batch].resize(per_batch_size, 0);
+  if (pad_per_class) {
+    per_batch_size =
+        std::min(total_size_per_batch, max_size_per_class * num_classes);
   }
 
+  Tensor* valid_detections_t = nullptr;
+  TensorShape valid_detections_shape({num_batches});
+  OP_REQUIRES_OK(context, context->allocate_output(3, valid_detections_shape,
+                                                   &valid_detections_t));
+  auto valid_detections_flat = valid_detections_t->template flat<int>();
+
+  auto shard_result = [&](int begin, int end) {
+    for (int batch_idx = begin; batch_idx < end; ++batch_idx) {
+      SelectResultPerBatch(
+          nmsed_boxes[batch_idx], nmsed_scores[batch_idx],
+          nmsed_classes[batch_idx], result_candidate_vec[batch_idx],
+          final_valid_detections, batch_idx, total_size_per_batch,
+          pad_per_class, max_size_per_class * num_classes, clip_boxes,
+          per_batch_size);
+      valid_detections_flat(batch_idx) = final_valid_detections[batch_idx];
+    }
+  };
+  length = num_batches;
+  // Input data boxes_data, scores_data
+  input_bytes =
+      num_boxes * 10 * sizeof(float) + per_batch_size * 6 * sizeof(float);
+  output_bytes =
+      num_boxes * 5 * sizeof(float) + per_batch_size * 6 * sizeof(float);
+  compute_cycles = Eigen::TensorOpCost::AddCost<int>() * num_boxes * 5 +
+                   Eigen::TensorOpCost::AddCost<float>() * num_boxes * 5;
+  // The cost here is not the actual number of cycles, but rather a set of
+  // hand-tuned numbers that seem to work best.
+  const Eigen::TensorOpCost cost_result(input_bytes, output_bytes,
+                                        compute_cycles);
+  d.parallelFor(length, cost_result, shard_result);
+
   Tensor* nmsed_boxes_t = nullptr;
   TensorShape boxes_shape({num_batches, per_batch_size, 4});
   OP_REQUIRES_OK(context,
@@ -477,23 +544,27 @@ void BatchedNonMaxSuppressionOp(
                      context->allocate_output(2, scores_shape, &nmsed_classes_t));
   auto nmsed_classes_flat = nmsed_classes_t->template flat<float>();
 
-  Tensor* valid_detections_t = nullptr;
-  TensorShape valid_detections_shape({num_batches});
-  OP_REQUIRES_OK(context, context->allocate_output(3, valid_detections_shape,
-                                                   &valid_detections_t));
-  auto valid_detections_flat = valid_detections_t->template flat<int>();
-
-  for (int i = 0; i < num_batches; ++i) {
-    valid_detections_flat(i) = final_valid_detections[i];
-    for (int j = 0; j < per_batch_size; ++j) {
-      nmsed_scores_flat(i * per_batch_size + j) = nmsed_scores[i][j];
-      nmsed_classes_flat(i * per_batch_size + j) = nmsed_classes[i][j];
+  auto shard_copy_result = [&](int begin, int end) {
+    for (int idx = begin; idx < end; ++idx) {
+      int batch_idx = idx / per_batch_size;
+      int j = idx % per_batch_size;
+      nmsed_scores_flat(idx) = nmsed_scores[batch_idx][j];
+      nmsed_classes_flat(idx) = nmsed_classes[batch_idx][j];
       for (int k = 0; k < 4; ++k) {
-        nmsed_boxes_flat(i * per_batch_size * 4 + j * 4 + k) =
-            nmsed_boxes[i][j * 4 + k];
+        nmsed_boxes_flat(idx * 4 + k) = nmsed_boxes[batch_idx][j * 4 + k];
      }
     }
-  }
+  };
+  length = num_batches * per_batch_size;
+  // Input data boxes_data, scores_data
+  input_bytes = 6 * sizeof(float);
+  output_bytes = 6 * sizeof(float);
+  compute_cycles = Eigen::TensorOpCost::AddCost<int>() * 2 +
+                   Eigen::TensorOpCost::MulCost<int>() * 2 +
+                   Eigen::TensorOpCost::DivCost<int>() * 2;
+  const Eigen::TensorOpCost cost_copy_result(input_bytes, output_bytes,
+                                             compute_cycles);
+  d.parallelFor(length, cost_copy_result, shard_copy_result);
 }
 
 }  // namespace
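A note on the dispatch pattern above: Eigen's ThreadPoolDevice::parallelFor derives a shard size from the supplied Eigen::TensorOpCost, so the hand-tuned byte and cycle counts control task granularity rather than describing measured costs. A minimal standalone sketch of the same mechanism, with illustrative thread counts and cost numbers rather than values from this change:

  #define EIGEN_USE_THREADS
  #include <functional>

  #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

  // Runs `work` over [0, length) in shards whose size Eigen picks from the
  // per-item cost estimate: cheap items get large shards, expensive items
  // small ones, mirroring how shard_nms and shard_result are dispatched above.
  void ParallelApply(Eigen::Index length,
                     const std::function<void(Eigen::Index, Eigen::Index)>& work) {
    Eigen::ThreadPool pool(/*num_threads=*/8);  // illustrative pool size
    Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/8);
    Eigen::TensorOpCost cost(/*bytes_loaded=*/40.0, /*bytes_stored=*/40.0,
                             /*compute_cycles=*/80.0);
    device.parallelFor(length, cost, work);  // work(begin, end) per shard
  }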
diff --git a/tensorflow/core/kernels/non_max_suppression_op_benchmark_test.cc b/tensorflow/core/kernels/non_max_suppression_op_benchmark_test.cc
new file mode 100644
index 00000000000..40c8d77ec9d
--- /dev/null
+++ b/tensorflow/core/kernels/non_max_suppression_op_benchmark_test.cc
@@ -0,0 +1,78 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+
+static Graph* BM_CombinedNonMaxSuppression(int batches, int box_num,
+                                           int class_num, int q) {
+  Graph* g = new Graph(OpRegistry::Global());
+  Tensor boxes(DT_FLOAT, TensorShape({batches, box_num, q, 4}));
+  boxes.flat<float>().setRandom();
+  Tensor scores(DT_FLOAT, TensorShape({batches, box_num, class_num}));
+  scores.flat<float>().setRandom();
+
+  Tensor max_output_size_per_class(100);
+  Tensor max_total_size(9000);
+  Tensor iou_threshold(float(0.3));
+  Tensor score_threshold(float(0.25));
+
+  Node* ret;
+  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "CombinedNonMaxSuppression")
+                  .Input(test::graph::Constant(g, boxes))
+                  .Input(test::graph::Constant(g, scores))
+                  .Input(test::graph::Constant(g, max_output_size_per_class))
+                  .Input(test::graph::Constant(g, max_total_size))
+                  .Input(test::graph::Constant(g, iou_threshold))
+                  .Input(test::graph::Constant(g, score_threshold))
+                  .Attr("pad_per_class", false)
+                  .Attr("clip_boxes", true)
+                  .Finalize(g, &ret));
+  return g;
+}
+
+#define BM_CombinedNonMaxSuppressionDev(DEVICE, B, BN, CN, Q)                \
+  static void BM_CombinedNMS_##DEVICE##_##B##_##BN##_##CN##_##Q(int iters) { \
+    testing::ItemsProcessed(iters* B);                                       \
+    test::Benchmark(#DEVICE, BM_CombinedNonMaxSuppression(B, BN, CN, Q))     \
+        .Run(iters);                                                         \
+  }                                                                          \
+  BENCHMARK(BM_CombinedNMS_##DEVICE##_##B##_##BN##_##CN##_##Q);
+
+#define BM_Batch(BN, CN, Q)                           \
+  BM_CombinedNonMaxSuppressionDev(cpu, 1, BN, CN, Q);  \
+  BM_CombinedNonMaxSuppressionDev(cpu, 28, BN, CN, Q); \
+  BM_CombinedNonMaxSuppressionDev(cpu, 32, BN, CN, Q); \
+  BM_CombinedNonMaxSuppressionDev(cpu, 64, BN, CN, Q);
+
+#define BN_Boxes_Number(CN, Q) \
+  BM_Batch(500, CN, Q);        \
+  BM_Batch(1000, CN, Q);       \
+  BM_Batch(1917, CN, Q);       \
+  BM_Batch(2500, CN, Q);
+
+BN_Boxes_Number(25, 1);
+BN_Boxes_Number(25, 25);
+BN_Boxes_Number(90, 1);
+BN_Boxes_Number(90, 90);
+BN_Boxes_Number(200, 1);
+BN_Boxes_Number(200, 200);
+
+}  // namespace tensorflow
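For reference, each BM_CombinedNonMaxSuppressionDev instantiation above expands to an ordinary registered benchmark function; for example, BM_CombinedNonMaxSuppressionDev(cpu, 1, 500, 25, 1) expands to roughly:

  static void BM_CombinedNMS_cpu_1_500_25_1(int iters) {
    testing::ItemsProcessed(iters * 1);  // one batch processed per iteration
    test::Benchmark("cpu", BM_CombinedNonMaxSuppression(1, 500, 25, 1))
        .Run(iters);
  }
  BENCHMARK(BM_CombinedNMS_cpu_1_500_25_1);

The two wrapper macros fan this out over four batch sizes and four box counts for each of the six (class_num, q) pairs, 96 benchmarks in total.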
#include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/threadpool.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/public/session_options.h" #include "tensorflow/core/public/version.h" @@ -183,6 +184,7 @@ class OpsTestBase : public ::testing::Test { std::unique_ptr flib_def_; std::unique_ptr pflr_; + std::unique_ptr thread_pool_; private: TF_DISALLOW_COPY_AND_ASSIGN(OpsTestBase); diff --git a/tensorflow/core/kernels/sparse_cross_op.cc b/tensorflow/core/kernels/sparse_cross_op.cc index a16e34c7cb4..c7c538a945f 100644 --- a/tensorflow/core/kernels/sparse_cross_op.cc +++ b/tensorflow/core/kernels/sparse_cross_op.cc @@ -308,8 +308,8 @@ class SparseCrossOp : public OpKernel { OP_REQUIRES_OK(context, context->input_list("dense_inputs", &dense_list_in)); - ValidateInput(context, indices_list_in, values_list_in, shapes_list_in, - dense_list_in); + OP_REQUIRES_OK(context, ValidateInput(indices_list_in, values_list_in, + shapes_list_in, dense_list_in)); std::vector>> columns = GenerateColumnsFromInput(indices_list_in, values_list_in, @@ -322,8 +322,10 @@ class SparseCrossOp : public OpKernel { Tensor* shape_out; const int64 batch_size = CalculateBatchSize(shapes_list_in, dense_list_in); std::vector output_start_indices(batch_size); - CreateOutputTensors(columns, batch_size, context, &indices_out, &values_out, - &shape_out, &output_start_indices); + OP_REQUIRES_OK( + context, + CreateOutputTensors(columns, batch_size, context, &indices_out, + &values_out, &shape_out, &output_start_indices)); typename CrossTraits::Updater updater( output_start_indices, indices_out, values_out); @@ -348,83 +350,93 @@ class SparseCrossOp : public OpKernel { private: // Validates input tensors. - void ValidateInput(OpKernelContext* context, - const OpInputList& indices_list_in, - const OpInputList& values_list_in, - const OpInputList& shapes_list_in, - const OpInputList& dense_list_in) { + Status ValidateInput(const OpInputList& indices_list_in, + const OpInputList& values_list_in, + const OpInputList& shapes_list_in, + const OpInputList& dense_list_in) { const auto size = indices_list_in.size(); // Validates indices_list_in OpInputList. for (int i = 0; i < size; i++) { - OP_REQUIRES( - context, TensorShapeUtils::IsMatrix(indices_list_in[i].shape()), - errors::InvalidArgument( - "Input indices should be a matrix but received shape ", - indices_list_in[i].shape().DebugString(), " at position ", i)); - OP_REQUIRES( - context, indices_list_in[i].shape().dim_size(1) == 2, - errors::InvalidArgument("Expected D2 of index to be 2 got ", - indices_list_in[i].shape().dim_size(1), - " at position ", i)); + if (!TensorShapeUtils::IsMatrix(indices_list_in[i].shape())) { + return errors::InvalidArgument( + "Input indices should be a matrix but received shape ", + indices_list_in[i].shape().DebugString(), " at position ", i); + } + if (indices_list_in[i].shape().dim_size(1) != 2) { + return errors::InvalidArgument("Expected D2 of index to be 2 got ", + indices_list_in[i].shape().dim_size(1), + " at position ", i); + } } // Validates values_list_in OpInputList. 
- OP_REQUIRES( - context, values_list_in.size() == size, - errors::InvalidArgument("Expected ", size, " input values, got ", - values_list_in.size())); + if (values_list_in.size() != size) { + return errors::InvalidArgument("Expected ", size, " input values, got ", + values_list_in.size()); + } for (int i = 0; i < size; i++) { - OP_REQUIRES( - context, TensorShapeUtils::IsVector(values_list_in[i].shape()), - errors::InvalidArgument( - "Input values should be a std::vector but received shape ", - values_list_in[i].shape().DebugString(), " at position ", i)); - OP_REQUIRES( - context, - indices_list_in[i].shape().dim_size(0) == - values_list_in[i].shape().dim_size(0), - errors::InvalidArgument( - "Expected size of values to be ", - indices_list_in[i].shape().dim_size(0), " got ", - values_list_in[i].shape().dim_size(0), " at position ", i)); + if (!TensorShapeUtils::IsVector(values_list_in[i].shape())) { + return errors::InvalidArgument( + "Input values should be a vector but received shape ", + values_list_in[i].shape().DebugString(), " at position ", i); + } + if (indices_list_in[i].shape().dim_size(0) != + values_list_in[i].shape().dim_size(0)) { + return errors::InvalidArgument( + "Expected size of values to be ", + indices_list_in[i].shape().dim_size(0), " got ", + values_list_in[i].shape().dim_size(0), " at position ", i); + } } // Validates shapes_list_in OpInputList - OP_REQUIRES( - context, shapes_list_in.size() == size, - errors::InvalidArgument("Expected ", size, " input shapes, got ", - shapes_list_in.size())); - const auto batch_size = CalculateBatchSize(shapes_list_in, dense_list_in); + if (shapes_list_in.size() != size) { + return errors::InvalidArgument("Expected ", size, " input shapes, got ", + shapes_list_in.size()); + } for (int i = 0; i < size; i++) { - OP_REQUIRES( - context, TensorShapeUtils::IsVector(shapes_list_in[i].shape()), - errors::InvalidArgument( - "Input shapes should be a std::vector but received shape ", - shapes_list_in[i].shape().DebugString(), " at position ", i)); + if (!TensorShapeUtils::IsVector(shapes_list_in[i].shape())) { + return errors::InvalidArgument( + "Input shapes should be a vector but received shape ", + shapes_list_in[i].shape().DebugString(), " at position ", i); + } - OP_REQUIRES( - context, shapes_list_in[i].vec().size() == 2, - errors::InvalidArgument("shape should imply a 2D tensor, but got ", - shapes_list_in[i].shape().DebugString(), - " at position ", i)); - OP_REQUIRES(context, shapes_list_in[i].vec()(0) == batch_size, - errors::InvalidArgument( - "Expected batch size ", batch_size, " got ", - shapes_list_in[i].vec()(0), " at position ", i)); + if (shapes_list_in[i].vec().size() != 2) { + return errors::InvalidArgument( + "shape should imply a 2D tensor, but got ", + shapes_list_in[i].shape().DebugString(), " at position ", i); + } } // Validates dense_list_in OpInputList for (int i = 0; i < dense_list_in.size(); ++i) { - OP_REQUIRES( - context, TensorShapeUtils::IsMatrix(dense_list_in[i].shape()), - errors::InvalidArgument( - "Dense inputs should be a matrix but received shape ", - dense_list_in[i].shape().DebugString(), " at position ", i)); - OP_REQUIRES(context, dense_list_in[i].dim_size(0) == batch_size, - errors::InvalidArgument("Expected batch size ", batch_size, - " got ", dense_list_in[i].dim_size(0), - " at dense tensor ", i)); + if (!TensorShapeUtils::IsMatrix(dense_list_in[i].shape())) { + return errors::InvalidArgument( + "Dense inputs should be a matrix but received shape ", + dense_list_in[i].shape().DebugString(), 
" at position ", i); + } } + + // Validates batch sizes. (Note: we do this after validating the input + // shapes, because CalculateBatchSize() depends on inputs having valid + // shapes). + const auto batch_size = CalculateBatchSize(shapes_list_in, dense_list_in); + for (int i = 0; i < size; i++) { + if (shapes_list_in[i].vec()(0) != batch_size) { + return errors::InvalidArgument( + "Expected batch size ", batch_size, " got ", + shapes_list_in[i].vec()(0), " at position ", i); + } + } + for (int i = 0; i < dense_list_in.size(); ++i) { + if (dense_list_in[i].dim_size(0) != batch_size) { + return errors::InvalidArgument("Expected batch size ", batch_size, + " got ", dense_list_in[i].dim_size(0), + " at dense tensor ", i); + } + } + + return Status::OK(); } // Calculate the batch size from either the shapes input or the dense input. @@ -500,7 +512,7 @@ class SparseCrossOp : public OpKernel { // the output SparseTensor. // It also output_start_indices which contains the start indices for each // input in the output SparseTensor. - void CreateOutputTensors( + Status CreateOutputTensors( const std::vector>>& columns, int64 batch_size, OpKernelContext* context, Tensor** indices_out, @@ -518,19 +530,19 @@ class SparseCrossOp : public OpKernel { } // Allocates tensors. - OP_REQUIRES_OK(context, - context->allocate_output( - 0, TensorShape({cross_count_total, 2}), indices_out)); - OP_REQUIRES_OK(context, - context->allocate_output(1, TensorShape({cross_count_total}), - values_out)); - OP_REQUIRES_OK(context, - context->allocate_output(2, TensorShape({2}), shape_out)); + TF_RETURN_IF_ERROR(context->allocate_output( + 0, TensorShape({cross_count_total, 2}), indices_out)); + TF_RETURN_IF_ERROR(context->allocate_output( + 1, TensorShape({cross_count_total}), values_out)); + TF_RETURN_IF_ERROR( + context->allocate_output(2, TensorShape({2}), shape_out)); // Sets shape. auto shape_vec = (*shape_out)->vec(); shape_vec(0) = batch_size; shape_vec(1) = max_cross_count; + + return Status::OK(); } // Returns number of crosses for a given batch_index diff --git a/tensorflow/core/kernels/spectrogram_test.cc b/tensorflow/core/kernels/spectrogram_test.cc index 73175a91a00..2d50d833d03 100644 --- a/tensorflow/core/kernels/spectrogram_test.cc +++ b/tensorflow/core/kernels/spectrogram_test.cc @@ -23,6 +23,8 @@ limitations under the License. 
#include "tensorflow/core/kernels/spectrogram_test_utils.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/platform/path.h" +#include "tensorflow/core/platform/resource_loader.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" @@ -30,17 +32,24 @@ namespace tensorflow { using ::std::complex; -const char kInputFilename[] = - "core/kernels/spectrogram_test_data/short_test_segment.wav"; +string InputFilename() { + return io::JoinPath("tensorflow", "core", "kernels", "spectrogram_test_data", + "short_test_segment.wav"); +} + +string ExpectedFilename() { + return io::JoinPath("tensorflow", "core", "kernels", "spectrogram_test_data", + "short_test_segment_spectrogram.csv.bin"); +} -const char kExpectedFilename[] = - "core/kernels/spectrogram_test_data/short_test_segment_spectrogram.csv.bin"; const int kDataVectorLength = 257; const int kNumberOfFramesInTestData = 178; -const char kExpectedNonPowerOfTwoFilename[] = - "core/kernels/spectrogram_test_data/" - "short_test_segment_spectrogram_400_200.csv.bin"; +string ExpectedNonPowerOfTwoFilename() { + return io::JoinPath("tensorflow", "core", "kernels", "spectrogram_test_data", + "short_test_segment_spectrogram_400_200.csv.bin"); +} + const int kNonPowerOfTwoDataVectorLength = 257; const int kNumberOfFramesInNonPowerOfTwoTestData = 228; @@ -206,9 +215,8 @@ TEST(SpectrogramTest, ReInitializationWorks) { Spectrogram sgram; sgram.Initialize(512, 256); std::vector input; - CHECK(ReadWaveFileToVector( - tensorflow::io::JoinPath(testing::TensorFlowSrcRoot(), kInputFilename), - &input)); + CHECK( + ReadWaveFileToVector(GetDataDependencyFilepath(InputFilename()), &input)); std::vector>> first_output; std::vector>> second_output; sgram.Initialize(512, 256); @@ -233,14 +241,13 @@ TEST(SpectrogramTest, ComputedComplexDataAgreeWithMatlab) { Spectrogram sgram; sgram.Initialize(512, 256); std::vector input; - CHECK(ReadWaveFileToVector( - tensorflow::io::JoinPath(testing::TensorFlowSrcRoot(), kInputFilename), - &input)); + CHECK( + ReadWaveFileToVector(GetDataDependencyFilepath(InputFilename()), &input)); EXPECT_EQ(kInputDataLength, input.size()); std::vector>> expected_output; ASSERT_TRUE(ReadRawFloatFileToComplexVector( - tensorflow::io::JoinPath(testing::TensorFlowSrcRoot(), kExpectedFilename), - kDataVectorLength, &expected_output)); + GetDataDependencyFilepath(ExpectedFilename()), kDataVectorLength, + &expected_output)); EXPECT_EQ(kNumberOfFramesInTestData, expected_output.size()); EXPECT_EQ(kDataVectorLength, expected_output[0].size()); std::vector>> output; @@ -253,16 +260,15 @@ TEST(SpectrogramTest, ComputedFloatComplexDataAgreeWithMatlab) { Spectrogram sgram; sgram.Initialize(512, 256); std::vector double_input; - CHECK(ReadWaveFileToVector( - tensorflow::io::JoinPath(testing::TensorFlowSrcRoot(), kInputFilename), - &double_input)); + CHECK(ReadWaveFileToVector(GetDataDependencyFilepath(InputFilename()), + &double_input)); std::vector input; input.assign(double_input.begin(), double_input.end()); EXPECT_EQ(kInputDataLength, input.size()); std::vector>> expected_output; ASSERT_TRUE(ReadRawFloatFileToComplexVector( - tensorflow::io::JoinPath(testing::TensorFlowSrcRoot(), kExpectedFilename), - kDataVectorLength, &expected_output)); + GetDataDependencyFilepath(ExpectedFilename()), kDataVectorLength, + &expected_output)); EXPECT_EQ(kNumberOfFramesInTestData, expected_output.size()); EXPECT_EQ(kDataVectorLength, expected_output[0].size()); 
std::vector>> output; @@ -275,14 +281,13 @@ TEST(SpectrogramTest, ComputedSquaredMagnitudeDataAgreeWithMatlab) { Spectrogram sgram; sgram.Initialize(512, 256); std::vector input; - CHECK(ReadWaveFileToVector( - tensorflow::io::JoinPath(testing::TensorFlowSrcRoot(), kInputFilename), - &input)); + CHECK( + ReadWaveFileToVector(GetDataDependencyFilepath(InputFilename()), &input)); EXPECT_EQ(kInputDataLength, input.size()); std::vector>> expected_output; ASSERT_TRUE(ReadRawFloatFileToComplexVector( - tensorflow::io::JoinPath(testing::TensorFlowSrcRoot(), kExpectedFilename), - kDataVectorLength, &expected_output)); + GetDataDependencyFilepath(ExpectedFilename()), kDataVectorLength, + &expected_output)); EXPECT_EQ(kNumberOfFramesInTestData, expected_output.size()); EXPECT_EQ(kDataVectorLength, expected_output[0].size()); std::vector> output; @@ -295,16 +300,15 @@ TEST(SpectrogramTest, ComputedFloatSquaredMagnitudeDataAgreeWithMatlab) { Spectrogram sgram; sgram.Initialize(512, 256); std::vector double_input; - CHECK(ReadWaveFileToVector( - tensorflow::io::JoinPath(testing::TensorFlowSrcRoot(), kInputFilename), - &double_input)); + CHECK(ReadWaveFileToVector(GetDataDependencyFilepath(InputFilename()), + &double_input)); EXPECT_EQ(kInputDataLength, double_input.size()); std::vector input; input.assign(double_input.begin(), double_input.end()); std::vector>> expected_output; ASSERT_TRUE(ReadRawFloatFileToComplexVector( - tensorflow::io::JoinPath(testing::TensorFlowSrcRoot(), kExpectedFilename), - kDataVectorLength, &expected_output)); + GetDataDependencyFilepath(ExpectedFilename()), kDataVectorLength, + &expected_output)); EXPECT_EQ(kNumberOfFramesInTestData, expected_output.size()); EXPECT_EQ(kDataVectorLength, expected_output[0].size()); std::vector> output; @@ -321,14 +325,12 @@ TEST(SpectrogramTest, ComputedNonPowerOfTwoComplexDataAgreeWithMatlab) { Spectrogram sgram; sgram.Initialize(400, 200); std::vector input; - CHECK(ReadWaveFileToVector( - tensorflow::io::JoinPath(testing::TensorFlowSrcRoot(), kInputFilename), - &input)); + CHECK( + ReadWaveFileToVector(GetDataDependencyFilepath(InputFilename()), &input)); EXPECT_EQ(kInputDataLength, input.size()); std::vector>> expected_output; ASSERT_TRUE(ReadRawFloatFileToComplexVector( - tensorflow::io::JoinPath(testing::TensorFlowSrcRoot(), - kExpectedNonPowerOfTwoFilename), + GetDataDependencyFilepath(ExpectedNonPowerOfTwoFilename()), kNonPowerOfTwoDataVectorLength, &expected_output)); EXPECT_EQ(kNumberOfFramesInNonPowerOfTwoTestData, expected_output.size()); EXPECT_EQ(kNonPowerOfTwoDataVectorLength, expected_output[0].size()); diff --git a/tensorflow/core/kernels/where_op.cc b/tensorflow/core/kernels/where_op.cc index 318894bfce4..598cb526d77 100644 --- a/tensorflow/core/kernels/where_op.cc +++ b/tensorflow/core/kernels/where_op.cc @@ -75,7 +75,7 @@ template struct NumTrue { static Status Compute(OpKernelContext* ctx, const CPUDevice& d, typename TTypes::ConstFlat input, - TTypes::Scalar num_true) { + TTypes::UnalignedScalar num_true) { num_true() = CountAccumulator(input.data(), input.data() + input.size()); return Status::OK(); } @@ -140,18 +140,14 @@ class WhereCPUOp : public OpKernel { const int input_dims = input.dims(); - Tensor num_true; - AllocatorAttributes attr; - attr.set_on_host(true); - OP_REQUIRES_OK(context, context->allocate_temp(DT_INT64, TensorShape({}), - &num_true, attr)); - auto num_true_t = num_true.scalar(); + int64 num_true; + TTypes::UnalignedScalar num_true_t(&num_true); Status s = functor::NumTrue::Compute( context, 
context->eigen_device(), input.flat(), num_true_t); OP_REQUIRES_OK(context, s); - TensorShape output_shape({num_true_t(), input_dims}); + TensorShape output_shape({num_true, input_dims}); Tensor* output = nullptr; OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output)); @@ -216,7 +212,7 @@ namespace functor { template <> \ Status NumTrue::Compute( \ OpKernelContext* ctx, const GPUDevice& d, TTypes::ConstFlat input, \ - TTypes::Scalar num_true); \ + TTypes::UnalignedScalar num_true); \ extern template struct NumTrue #define DECLARE_GPU_NUMTRUE_TYPE(T) \ @@ -287,8 +283,8 @@ class WhereGPUOp : public AsyncOpKernel { context->allocate_temp(DataTypeToEnum::v(), TensorShape({}), &num_true), done); - - auto num_true_t = num_true.scalar(); + typename TTypes::UnalignedScalar num_true_t( + num_true.scalar().data()); se::DeviceMemoryBase num_true_ptr(static_cast(num_true_t.data())); // Push kernel to stream to get number of true elements. diff --git a/tensorflow/core/kernels/where_op.h b/tensorflow/core/kernels/where_op.h index 7297d37ffb8..58d38139f3a 100644 --- a/tensorflow/core/kernels/where_op.h +++ b/tensorflow/core/kernels/where_op.h @@ -41,7 +41,7 @@ struct NumTrue { EIGEN_ALWAYS_INLINE static Status Compute( OpKernelContext* ctx, const Device& d, typename TTypes::ConstFlat input, - typename TTypes::Scalar num_true); + typename TTypes::UnalignedScalar num_true); }; template diff --git a/tensorflow/core/kernels/where_op_gpu.cu.h b/tensorflow/core/kernels/where_op_gpu.cu.h index 3795733f959..f13f504c1d7 100644 --- a/tensorflow/core/kernels/where_op_gpu.cu.h +++ b/tensorflow/core/kernels/where_op_gpu.cu.h @@ -149,7 +149,7 @@ struct NumTrue { EIGEN_ALWAYS_INLINE static Status Compute( OpKernelContext* ctx, const GPUDevice& d, typename TTypes::ConstFlat input, - typename TTypes::Scalar num_true) { + typename TTypes::UnalignedScalar num_true) { const auto& cu_stream = GetGpuStream(ctx); std::size_t temp_storage_bytes = 0; diff --git a/tensorflow/core/lib/bfloat16/bfloat16.h b/tensorflow/core/lib/bfloat16/bfloat16.h index a133f7e0f17..a25f4d947ed 100644 --- a/tensorflow/core/lib/bfloat16/bfloat16.h +++ b/tensorflow/core/lib/bfloat16/bfloat16.h @@ -18,6 +18,7 @@ limitations under the License. 
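On the Scalar to UnalignedScalar switch above: a TTypes<T>::Scalar map requires an Eigen-aligned buffer, which is why the old CPU path had to allocate a temporary host Tensor just to hold one counter. An UnalignedScalar is the same rank-0 TensorMap without the alignment requirement, so it can wrap an ordinary stack variable, which is exactly what the new WhereCPUOp code does. A small illustrative fragment (TensorFlow types assumed in scope):

  // Fragment, assuming tensorflow::int64 and tensorflow::TTypes are visible.
  int64 num_true = 0;
  TTypes<int64>::UnalignedScalar num_true_t(&num_true);  // rank-0 map, no alignment
  num_true_t() = 42;        // writes go through the map...
  DCHECK_EQ(num_true, 42);  // ...and land in the plain stack variable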
diff --git a/tensorflow/core/lib/bfloat16/bfloat16.h b/tensorflow/core/lib/bfloat16/bfloat16.h
index a133f7e0f17..a25f4d947ed 100644
--- a/tensorflow/core/lib/bfloat16/bfloat16.h
+++ b/tensorflow/core/lib/bfloat16/bfloat16.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <cmath>
 #include <complex>
+#include <cstring>
 
 #include "tensorflow/core/platform/byte_order.h"
diff --git a/tensorflow/core/lib/io/BUILD b/tensorflow/core/lib/io/BUILD
index 87b5090a59f..d03a895b429 100644
--- a/tensorflow/core/lib/io/BUILD
+++ b/tensorflow/core/lib/io/BUILD
@@ -208,6 +208,21 @@ cc_library(
     alwayslink = True,
 )
 
+cc_library(
+    name = "cache",
+    srcs = [
+        "cache.cc",
+    ],
+    hdrs = [
+        "cache.h",
+    ],
+    deps = [
+        "//tensorflow/core/platform:coding",
+        "//tensorflow/core/platform:mutex",
+        "//tensorflow/core/platform:stringpiece",
+    ],
+)
+
 cc_library(
     name = "table",
     srcs = [
@@ -220,6 +235,7 @@ cc_library(
     ],
     deps = [
         ":block",
+        ":cache",
         ":iterator",
         ":table_options",
         "//tensorflow/core/lib/core:coding",
@@ -290,6 +306,8 @@ filegroup(
         "block_builder.h",
         "buffered_inputstream.cc",
         "buffered_inputstream.h",
+        "cache.cc",
+        "cache.h",
         "compression.cc",
         "compression.h",
         "format.cc",
@@ -352,6 +370,7 @@ filegroup(
     name = "legacy_lib_io_all_tests",
     srcs = [
         "buffered_inputstream_test.cc",
+        "cache_test.cc",
         "inputbuffer_test.cc",
         "inputstream_interface_test.cc",
         "path_test.cc",
@@ -369,6 +388,7 @@ filegroup(
     name = "legacy_lib_io_headers",
     srcs = [
         "buffered_inputstream.h",
+        "cache.h",
         "compression.h",
         "inputstream_interface.h",
         "path.h",
diff --git a/tensorflow/core/lib/io/cache.cc b/tensorflow/core/lib/io/cache.cc
new file mode 100644
index 00000000000..b5521b1752b
--- /dev/null
+++ b/tensorflow/core/lib/io/cache.cc
@@ -0,0 +1,450 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/lib/io/cache.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "tensorflow/core/platform/coding.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+
+namespace table {
+
+Cache::~Cache() {}
+
+namespace {
+
+// LRU cache implementation
+//
+// Cache entries have an "in_cache" boolean indicating whether the cache has a
+// reference on the entry. The only ways that this can become false without the
+// entry being passed to its "deleter" are via Erase(), via Insert() when
+// an element with a duplicate key is inserted, or on destruction of the cache.
+//
+// The cache keeps two linked lists of items in the cache. All items in the
+// cache are in one list or the other, and never both. Items still referenced
+// by clients but erased from the cache are in neither list. The lists are:
+// - in-use: contains the items currently referenced by clients, in no
+//   particular order. (This list is used for invariant checking. If we
+//   removed the check, elements that would otherwise be on this list could be
+//   left as disconnected singleton lists.)
+// - LRU: contains the items not currently referenced by clients, in LRU order
+// Elements are moved between these lists by the Ref() and Unref() methods,
+// when they detect an element in the cache acquiring or losing its only
+// external reference.
+
+// An entry is a variable length heap-allocated structure. Entries
+// are kept in a circular doubly linked list ordered by access time.
+struct LRUHandle {
+  void* value;
+  void (*deleter)(const Slice&, void* value);
+  LRUHandle* next_hash;
+  LRUHandle* next;
+  LRUHandle* prev;
+  size_t charge;  // TODO(opt): Only allow uint32_t?
+  size_t key_length;
+  bool in_cache;     // Whether entry is in the cache.
+  uint32_t refs;     // References, including cache reference, if present.
+  uint32_t hash;     // Hash of key(); used for fast sharding and comparisons
+  char key_data[1];  // Beginning of key
+
+  Slice key() const {
+    // next_ is only equal to this if the LRU handle is the list head of an
+    // empty list. List heads never have meaningful keys.
+    assert(next != this);
+
+    return Slice(key_data, key_length);
+  }
+};
+
+// We provide our own simple hash table since it removes a whole bunch
+// of porting hacks and is also faster than some of the built-in hash
+// table implementations in some of the compiler/runtime combinations
+// we have tested. E.g., readrandom speeds up by ~5% over the g++
+// 4.4.3's builtin hashtable.
+class HandleTable {
+ public:
+  HandleTable() : length_(0), elems_(0), list_(nullptr) { Resize(); }
+  ~HandleTable() { delete[] list_; }
+
+  LRUHandle* Lookup(const Slice& key, uint32_t hash) {
+    return *FindPointer(key, hash);
+  }
+
+  LRUHandle* Insert(LRUHandle* h) {
+    LRUHandle** ptr = FindPointer(h->key(), h->hash);
+    LRUHandle* old = *ptr;
+    h->next_hash = (old == nullptr ? nullptr : old->next_hash);
+    *ptr = h;
+    if (old == nullptr) {
+      ++elems_;
+      if (elems_ > length_) {
+        // Since each cache entry is fairly large, we aim for a small
+        // average linked list length (<= 1).
+        Resize();
+      }
+    }
+    return old;
+  }
+
+  LRUHandle* Remove(const Slice& key, uint32_t hash) {
+    LRUHandle** ptr = FindPointer(key, hash);
+    LRUHandle* result = *ptr;
+    if (result != nullptr) {
+      *ptr = result->next_hash;
+      --elems_;
+    }
+    return result;
+  }
+
+ private:
+  // The table consists of an array of buckets where each bucket is
+  // a linked list of cache entries that hash into the bucket.
+  uint32_t length_;
+  uint32_t elems_;
+  LRUHandle** list_;
+
+  // Return a pointer to slot that points to a cache entry that
+  // matches key/hash. If there is no such cache entry, return a
+  // pointer to the trailing slot in the corresponding linked list.
+  LRUHandle** FindPointer(const Slice& key, uint32_t hash) {
+    LRUHandle** ptr = &list_[hash & (length_ - 1)];
+    while (*ptr != nullptr && ((*ptr)->hash != hash || key != (*ptr)->key())) {
+      ptr = &(*ptr)->next_hash;
+    }
+    return ptr;
+  }
+
+  void Resize() {
+    uint32_t new_length = 4;
+    while (new_length < elems_) {
+      new_length *= 2;
+    }
+    LRUHandle** new_list = new LRUHandle*[new_length];
+    memset(new_list, 0, sizeof(new_list[0]) * new_length);
+    uint32_t count = 0;
+    for (uint32_t i = 0; i < length_; i++) {
+      LRUHandle* h = list_[i];
+      while (h != nullptr) {
+        LRUHandle* next = h->next_hash;
+        uint32_t hash = h->hash;
+        LRUHandle** ptr = &new_list[hash & (new_length - 1)];
+        h->next_hash = *ptr;
+        *ptr = h;
+        h = next;
+        count++;
+      }
+    }
+    assert(elems_ == count);
+    delete[] list_;
+    list_ = new_list;
+    length_ = new_length;
+  }
+};
+
+// A single shard of sharded cache.
+class LRUCache {
+ public:
+  LRUCache();
+  ~LRUCache();
+
+  // Separate from constructor so caller can easily make an array of LRUCache
+  void SetCapacity(size_t capacity) { capacity_ = capacity; }
+
+  // Like Cache methods, but with an extra "hash" parameter.
+  Cache::Handle* Insert(const Slice& key, uint32_t hash, void* value,
+                        size_t charge,
+                        void (*deleter)(const Slice& key, void* value));
+  Cache::Handle* Lookup(const Slice& key, uint32_t hash);
+  void Release(Cache::Handle* handle);
+  void Erase(const Slice& key, uint32_t hash);
+  void Prune();
+  size_t TotalCharge() const {
+    mutex_lock l(mutex_);
+    return usage_;
+  }
+
+ private:
+  void LRU_Remove(LRUHandle* e);
+  void LRU_Append(LRUHandle* list, LRUHandle* e);
+  void Ref(LRUHandle* e);
+  void Unref(LRUHandle* e);
+  bool FinishErase(LRUHandle* e) EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+  // Initialized before use.
+  size_t capacity_;
+
+  // mutex_ protects the following state.
+  mutable mutex mutex_;
+  size_t usage_ GUARDED_BY(mutex_);
+
+  // Dummy head of LRU list.
+  // lru.prev is newest entry, lru.next is oldest entry.
+  // Entries have refs==1 and in_cache==true.
+  LRUHandle lru_ GUARDED_BY(mutex_);
+
+  // Dummy head of in-use list.
+  // Entries are in use by clients, and have refs >= 2 and in_cache==true.
+  LRUHandle in_use_ GUARDED_BY(mutex_);
+
+  HandleTable table_ GUARDED_BY(mutex_);
+};
+
+LRUCache::LRUCache() : capacity_(0), usage_(0) {
+  // Make empty circular linked lists.
+  lru_.next = &lru_;
+  lru_.prev = &lru_;
+  in_use_.next = &in_use_;
+  in_use_.prev = &in_use_;
+}
+
+LRUCache::~LRUCache() {
+  assert(in_use_.next == &in_use_);  // Error if caller has an unreleased handle
+  for (LRUHandle* e = lru_.next; e != &lru_;) {
+    LRUHandle* next = e->next;
+    assert(e->in_cache);
+    e->in_cache = false;
+    assert(e->refs == 1);  // Invariant of lru_ list.
+    Unref(e);
+    e = next;
+  }
+}
+
+void LRUCache::Ref(LRUHandle* e) {
+  if (e->refs == 1 && e->in_cache) {  // If on lru_ list, move to in_use_ list.
+    LRU_Remove(e);
+    LRU_Append(&in_use_, e);
+  }
+  e->refs++;
+}
+
+void LRUCache::Unref(LRUHandle* e) {
+  assert(e->refs > 0);
+  e->refs--;
+  if (e->refs == 0) {  // Deallocate.
+    assert(!e->in_cache);
+    (*e->deleter)(e->key(), e->value);
+    free(e);
+  } else if (e->in_cache && e->refs == 1) {
+    // No longer in use; move to lru_ list.
+    LRU_Remove(e);
+    LRU_Append(&lru_, e);
+  }
+}
+
+void LRUCache::LRU_Remove(LRUHandle* e) {
+  e->next->prev = e->prev;
+  e->prev->next = e->next;
+}
+
+void LRUCache::LRU_Append(LRUHandle* list, LRUHandle* e) {
+  // Make "e" newest entry by inserting just before *list
+  e->next = list;
+  e->prev = list->prev;
+  e->prev->next = e;
+  e->next->prev = e;
+}
+
+Cache::Handle* LRUCache::Lookup(const Slice& key, uint32_t hash) {
+  mutex_lock l(mutex_);
+  LRUHandle* e = table_.Lookup(key, hash);
+  if (e != nullptr) {
+    Ref(e);
+  }
+  return reinterpret_cast<Cache::Handle*>(e);
+}
+
+void LRUCache::Release(Cache::Handle* handle) {
+  mutex_lock l(mutex_);
+  Unref(reinterpret_cast<LRUHandle*>(handle));
+}
+
+Cache::Handle* LRUCache::Insert(const Slice& key, uint32_t hash, void* value,
+                                size_t charge,
+                                void (*deleter)(const Slice& key,
+                                                void* value)) {
+  mutex_lock l(mutex_);
+
+  LRUHandle* e = reinterpret_cast<LRUHandle*>(
+      malloc(sizeof(LRUHandle) - 1 + key.size()));
+  e->value = value;
+  e->deleter = deleter;
+  e->charge = charge;
+  e->key_length = key.size();
+  e->hash = hash;
+  e->in_cache = false;
+  e->refs = 1;  // for the returned handle.
+  memcpy(e->key_data, key.data(), key.size());
+
+  if (capacity_ > 0) {
+    e->refs++;  // for the cache's reference.
+    e->in_cache = true;
+    LRU_Append(&in_use_, e);
+    usage_ += charge;
+    FinishErase(table_.Insert(e));
+  } else {  // don't cache. (capacity_==0 is supported and turns off caching.)
+    // next is read by key() in an assert, so it must be initialized
+    e->next = nullptr;
+  }
+  while (usage_ > capacity_ && lru_.next != &lru_) {
+    LRUHandle* old = lru_.next;
+    assert(old->refs == 1);
+    bool erased = FinishErase(table_.Remove(old->key(), old->hash));
+    if (!erased) {  // to avoid unused variable when compiled NDEBUG
+      assert(erased);
+    }
+  }
+
+  return reinterpret_cast<Cache::Handle*>(e);
+}
+
+// If e != nullptr, finish removing *e from the cache; it has already been
+// removed from the hash table. Return whether e != nullptr.
+bool LRUCache::FinishErase(LRUHandle* e) {
+  if (e != nullptr) {
+    assert(e->in_cache);
+    LRU_Remove(e);
+    e->in_cache = false;
+    usage_ -= e->charge;
+    Unref(e);
+  }
+  return e != nullptr;
+}
+
+void LRUCache::Erase(const Slice& key, uint32_t hash) {
+  mutex_lock l(mutex_);
+  FinishErase(table_.Remove(key, hash));
+}
+
+void LRUCache::Prune() {
+  mutex_lock l(mutex_);
+  while (lru_.next != &lru_) {
+    LRUHandle* e = lru_.next;
+    assert(e->refs == 1);
+    bool erased = FinishErase(table_.Remove(e->key(), e->hash));
+    if (!erased) {  // to avoid unused variable when compiled NDEBUG
+      assert(erased);
+    }
+  }
+}
+
+static const int kNumShardBits = 4;
+static const int kNumShards = 1 << kNumShardBits;
+
+class ShardedLRUCache : public Cache {
+ private:
+  LRUCache shard_[kNumShards];
+  mutex id_mutex_;
+  uint64_t last_id_;
+
+  static inline uint32_t HashSlice(const Slice& s) {
+    return Hash(s.data(), s.size(), 0);
+  }
+
+  static uint32_t Shard(uint32_t hash) { return hash >> (32 - kNumShardBits); }
+
+ public:
+  explicit ShardedLRUCache(size_t capacity) : last_id_(0) {
+    const size_t per_shard = (capacity + (kNumShards - 1)) / kNumShards;
+    for (int s = 0; s < kNumShards; s++) {
+      shard_[s].SetCapacity(per_shard);
+    }
+  }
+  ~ShardedLRUCache() override {}
+  Handle* Insert(const Slice& key, void* value, size_t charge,
+                 void (*deleter)(const Slice& key, void* value)) override {
+    const uint32_t hash = HashSlice(key);
+    return shard_[Shard(hash)].Insert(key, hash, value, charge, deleter);
+  }
+  Handle* Lookup(const Slice& key) override {
+    const uint32_t hash = HashSlice(key);
+    return shard_[Shard(hash)].Lookup(key, hash);
+  }
+  void Release(Handle* handle) override {
+    LRUHandle* h = reinterpret_cast<LRUHandle*>(handle);
+    shard_[Shard(h->hash)].Release(handle);
+  }
+  void Erase(const Slice& key) override {
+    const uint32_t hash = HashSlice(key);
+    shard_[Shard(hash)].Erase(key, hash);
+  }
+  void* Value(Handle* handle) override {
+    return reinterpret_cast<LRUHandle*>(handle)->value;
+  }
+  uint64_t NewId() override {
+    mutex_lock l(id_mutex_);
+    return ++(last_id_);
+  }
+  void Prune() override {
+    for (int s = 0; s < kNumShards; s++) {
+      shard_[s].Prune();
+    }
+  }
+  size_t TotalCharge() const override {
+    size_t total = 0;
+    for (int s = 0; s < kNumShards; s++) {
+      total += shard_[s].TotalCharge();
+    }
+    return total;
+  }
+
+ private:
+  // TODO(byronyi): Figure out why Hash32 fails EvictionPolicy test.
+  static uint32_t Hash(const char* data, size_t n, uint32_t seed) {
+    // Similar to murmur hash
+    const uint32_t m = 0xc6a4a793;
+    const uint32_t r = 24;
+    const char* limit = data + n;
+    uint32_t h = seed ^ (n * m);
+
+    // Pick up four bytes at a time
+    while (data + 4 <= limit) {
+      uint32_t w = core::DecodeFixed32(data);
+      data += 4;
+      h += w;
+      h *= m;
+      h ^= (h >> 16);
+    }
+
+    // Pick up remaining bytes
+    switch (limit - data) {
+      case 3:
+        h += static_cast<unsigned char>(data[2]) << 16;
+        ABSL_FALLTHROUGH_INTENDED;
+      case 2:
+        h += static_cast<unsigned char>(data[1]) << 8;
+        ABSL_FALLTHROUGH_INTENDED;
+      case 1:
+        h += static_cast<unsigned char>(data[0]);
+        h *= m;
+        h ^= (h >> r);
+        break;
+    }
+    return h;
+  }
+};
+
+}  // end anonymous namespace
+
+Cache* NewLRUCache(size_t capacity) { return new ShardedLRUCache(capacity); }
+
+}  // namespace table
+
+}  // namespace tensorflow
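On the sharding just defined: each key is routed by the top kNumShardBits bits of its 32-bit hash, and each of the 16 shards owns its own mutex and an even split of the capacity, which keeps concurrent lookups from serializing on a single lock. Illustrative arithmetic (values chosen for the example, not taken from the patch):

  // kNumShardBits = 4  =>  kNumShards = 1 << 4 == 16 shards.
  uint32_t hash = 0xDEADBEEFu;
  uint32_t shard = hash >> (32 - 4);          // top four bits: 0xD, shard 13
  // NewLRUCache(1000): capacity splits evenly across shards, rounding up.
  size_t per_shard = (1000 + (16 - 1)) / 16;  // == 63 units per shard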
diff --git a/tensorflow/core/lib/io/cache.h b/tensorflow/core/lib/io/cache.h
new file mode 100644
index 00000000000..788a637077a
--- /dev/null
+++ b/tensorflow/core/lib/io/cache.h
@@ -0,0 +1,125 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_LIB_IO_CACHE_H_
+#define TENSORFLOW_CORE_LIB_IO_CACHE_H_
+
+#include "tensorflow/core/platform/stringpiece.h"
+
+// A Cache is an interface that maps keys to values. It has internal
+// synchronization and may be safely accessed concurrently from
+// multiple threads. It may automatically evict entries to make room
+// for new entries. Values have a specified charge against the cache
+// capacity. For example, a cache where the values are variable
+// length strings, may use the length of the string as the charge for
+// the string.
+//
+// A builtin cache implementation with a least-recently-used eviction
+// policy is provided. Clients may use their own implementations if
+// they want something more sophisticated (like scan-resistance, a
+// custom eviction policy, variable cache sizing, etc.)
+
+namespace tensorflow {
+
+using Slice = StringPiece;
+
+namespace table {
+
+class Cache;
+
+// Create a new cache with a fixed size capacity. This implementation
+// of Cache uses a least-recently-used eviction policy.
+Cache* NewLRUCache(size_t capacity);
+
+class Cache {
+ public:
+  Cache() = default;
+
+  Cache(const Cache&) = delete;
+  Cache& operator=(const Cache&) = delete;
+
+  // Destroys all existing entries by calling the "deleter"
+  // function that was passed to the constructor.
+  virtual ~Cache();
+
+  // Opaque handle to an entry stored in the cache.
+  struct Handle {};
+
+  // Insert a mapping from key->value into the cache and assign it
+  // the specified charge against the total cache capacity.
+  //
+  // Returns a handle that corresponds to the mapping. The caller
+  // must call this->Release(handle) when the returned mapping is no
+  // longer needed.
+  //
+  // When the inserted entry is no longer needed, the key and
+  // value will be passed to "deleter".
+  virtual Handle* Insert(const Slice& key, void* value, size_t charge,
+                         void (*deleter)(const Slice& key, void* value)) = 0;
+
+  // If the cache has no mapping for "key", returns nullptr.
+  //
+  // Else return a handle that corresponds to the mapping. The caller
+  // must call this->Release(handle) when the returned mapping is no
+  // longer needed.
+  virtual Handle* Lookup(const Slice& key) = 0;
+
+  // Release a mapping returned by a previous Lookup().
+  // REQUIRES: handle must not have been released yet.
+  // REQUIRES: handle must have been returned by a method on *this.
+  virtual void Release(Handle* handle) = 0;
+
+  // Return the value encapsulated in a handle returned by a
+  // successful Lookup().
+  // REQUIRES: handle must not have been released yet.
+  // REQUIRES: handle must have been returned by a method on *this.
+  virtual void* Value(Handle* handle) = 0;
+
+  // If the cache contains entry for key, erase it. Note that the
+  // underlying entry will be kept around until all existing handles
+  // to it have been released.
+  virtual void Erase(const Slice& key) = 0;
+
+  // Return a new numeric id. May be used by multiple clients who are
+  // sharing the same cache to partition the key space. Typically the
+  // client will allocate a new id at startup and prepend the id to
+  // its cache keys.
+  virtual uint64_t NewId() = 0;
+
+  // Remove all cache entries that are not actively in use. Memory-constrained
+  // applications may wish to call this method to reduce memory usage.
+  // Default implementation of Prune() does nothing. Subclasses are strongly
+  // encouraged to override the default implementation. A future release of
+  // leveldb may change Prune() to a pure abstract method.
+  virtual void Prune() {}
+
+  // Return an estimate of the combined charges of all elements stored in the
+  // cache.
+  virtual size_t TotalCharge() const = 0;
+
+ private:
+  void LRU_Remove(Handle* e);
+  void LRU_Append(Handle* e);
+  void Unref(Handle* e);
+
+  struct Rep;
+  Rep* rep_;
+};
+
+}  // namespace table
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_LIB_IO_CACHE_H_
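To make the contract above concrete, here is a minimal usage sketch of this interface (an arbitrary heap-allocated string as the value; error handling omitted):

  #include <string>

  #include "tensorflow/core/lib/io/cache.h"

  void CacheUsageExample() {
    tensorflow::table::Cache* cache =
        tensorflow::table::NewLRUCache(/*capacity=*/1024);

    // A capture-less lambda converts to the function-pointer deleter type;
    // it runs when the entry is evicted, erased, or the cache is destroyed.
    tensorflow::table::Cache::Handle* h = cache->Insert(
        "key", new std::string("payload"), /*charge=*/1,
        [](const tensorflow::Slice&, void* value) {
          delete static_cast<std::string*>(value);
        });

    // Value() is only safe to use while the handle is held.
    auto* payload = static_cast<std::string*>(cache->Value(h));
    (void)payload;
    cache->Release(h);  // every handle from Insert()/Lookup() must be released

    delete cache;  // remaining entries are destroyed through their deleters
  }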
diff --git a/tensorflow/core/lib/io/cache_test.cc b/tensorflow/core/lib/io/cache_test.cc
new file mode 100644
index 00000000000..38552d43b34
--- /dev/null
+++ b/tensorflow/core/lib/io/cache_test.cc
@@ -0,0 +1,238 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/lib/io/cache.h"
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/lib/core/coding.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+namespace table {
+// Conversions between numeric keys/values and the types expected by Cache.
+static std::string EncodeKey(int k) {
+  std::string result;
+  core::PutFixed32(&result, k);
+  return result;
+}
+static int DecodeKey(const Slice& k) {
+  assert(k.size() == 4);
+  return core::DecodeFixed32(k.data());
+}
+static void* EncodeValue(uintptr_t v) { return reinterpret_cast<void*>(v); }
+static int DecodeValue(void* v) { return reinterpret_cast<uintptr_t>(v); }
+
+class CacheTest : public ::testing::Test {
+ public:
+  static void Deleter(const Slice& key, void* v) {
+    current_->deleted_keys_.push_back(DecodeKey(key));
+    current_->deleted_values_.push_back(DecodeValue(v));
+  }
+
+  static const int kCacheSize = 1000;
+  std::vector<int> deleted_keys_;
+  std::vector<int> deleted_values_;
+  Cache* cache_;
+
+  CacheTest() : cache_(NewLRUCache(kCacheSize)) { current_ = this; }
+
+  ~CacheTest() { delete cache_; }
+
+  int Lookup(int key) {
+    Cache::Handle* handle = cache_->Lookup(EncodeKey(key));
+    const int r = (handle == nullptr) ? -1 : DecodeValue(cache_->Value(handle));
+    if (handle != nullptr) {
+      cache_->Release(handle);
+    }
+    return r;
+  }
+
+  void Insert(int key, int value, int charge = 1) {
+    cache_->Release(cache_->Insert(EncodeKey(key), EncodeValue(value), charge,
+                                   &CacheTest::Deleter));
+  }
+
+  Cache::Handle* InsertAndReturnHandle(int key, int value, int charge = 1) {
+    return cache_->Insert(EncodeKey(key), EncodeValue(value), charge,
+                          &CacheTest::Deleter);
+  }
+
+  void Erase(int key) { cache_->Erase(EncodeKey(key)); }
+  static CacheTest* current_;
+};
+CacheTest* CacheTest::current_;
+
+TEST_F(CacheTest, HitAndMiss) {
+  ASSERT_EQ(-1, Lookup(100));
+
+  Insert(100, 101);
+  ASSERT_EQ(101, Lookup(100));
+  ASSERT_EQ(-1, Lookup(200));
+  ASSERT_EQ(-1, Lookup(300));
+
+  Insert(200, 201);
+  ASSERT_EQ(101, Lookup(100));
+  ASSERT_EQ(201, Lookup(200));
+  ASSERT_EQ(-1, Lookup(300));
+
+  Insert(100, 102);
+  ASSERT_EQ(102, Lookup(100));
+  ASSERT_EQ(201, Lookup(200));
+  ASSERT_EQ(-1, Lookup(300));
+
+  ASSERT_EQ(1, deleted_keys_.size());
+  ASSERT_EQ(100, deleted_keys_[0]);
+  ASSERT_EQ(101, deleted_values_[0]);
+}
+
+TEST_F(CacheTest, Erase) {
+  Erase(200);
+  ASSERT_EQ(0, deleted_keys_.size());
+
+  Insert(100, 101);
+  Insert(200, 201);
+  Erase(100);
+  ASSERT_EQ(-1, Lookup(100));
+  ASSERT_EQ(201, Lookup(200));
+  ASSERT_EQ(1, deleted_keys_.size());
+  ASSERT_EQ(100, deleted_keys_[0]);
+  ASSERT_EQ(101, deleted_values_[0]);
+
+  Erase(100);
+  ASSERT_EQ(-1, Lookup(100));
+  ASSERT_EQ(201, Lookup(200));
+  ASSERT_EQ(1, deleted_keys_.size());
+}
+
+TEST_F(CacheTest, EntriesArePinned) {
+  Insert(100, 101);
+  Cache::Handle* h1 = cache_->Lookup(EncodeKey(100));
+  ASSERT_EQ(101, DecodeValue(cache_->Value(h1)));
+
+  Insert(100, 102);
+  Cache::Handle* h2 = cache_->Lookup(EncodeKey(100));
+  ASSERT_EQ(102, DecodeValue(cache_->Value(h2)));
+  ASSERT_EQ(0, deleted_keys_.size());
+
+  cache_->Release(h1);
+  ASSERT_EQ(1, deleted_keys_.size());
+  ASSERT_EQ(100, deleted_keys_[0]);
+  ASSERT_EQ(101, deleted_values_[0]);
+
+  Erase(100);
+  ASSERT_EQ(-1, Lookup(100));
+  ASSERT_EQ(1, deleted_keys_.size());
+
+  cache_->Release(h2);
+  ASSERT_EQ(2, deleted_keys_.size());
+  ASSERT_EQ(100, deleted_keys_[1]);
+  ASSERT_EQ(102, deleted_values_[1]);
+}
+
+TEST_F(CacheTest, EvictionPolicy) {
+  Insert(100, 101);
+  Insert(200, 201);
+  Insert(300, 301);
+  Cache::Handle* h = cache_->Lookup(EncodeKey(300));
+
+  // Frequently used entry must be kept around,
+  // as must things that are still in use.
+  for (int i = 0; i < kCacheSize + 100; i++) {
+    Insert(1000 + i, 2000 + i);
+    ASSERT_EQ(2000 + i, Lookup(1000 + i));
+    ASSERT_EQ(101, Lookup(100));
+  }
+  ASSERT_EQ(101, Lookup(100));
+  ASSERT_EQ(-1, Lookup(200));
+  ASSERT_EQ(301, Lookup(300));
+  cache_->Release(h);
+}
+
+TEST_F(CacheTest, UseExceedsCacheSize) {
+  // Overfill the cache, keeping handles on all inserted entries.
+  std::vector<Cache::Handle*> h;
+  for (int i = 0; i < kCacheSize + 100; i++) {
+    h.push_back(InsertAndReturnHandle(1000 + i, 2000 + i));
+  }
+
+  // Check that all the entries can be found in the cache.
+  for (int i = 0; i < h.size(); i++) {
+    ASSERT_EQ(2000 + i, Lookup(1000 + i));
+  }
+
+  for (int i = 0; i < h.size(); i++) {
+    cache_->Release(h[i]);
+  }
+}
+
+TEST_F(CacheTest, HeavyEntries) {
+  // Add a bunch of light and heavy entries and then count the combined
+  // size of items still in the cache, which must be approximately the
+  // same as the total capacity.
+  const int kLight = 1;
+  const int kHeavy = 10;
+  int added = 0;
+  int index = 0;
+  while (added < 2 * kCacheSize) {
+    const int weight = (index & 1) ? kLight : kHeavy;
+    Insert(index, 1000 + index, weight);
+    added += weight;
+    index++;
+  }
+
+  int cached_weight = 0;
+  for (int i = 0; i < index; i++) {
+    const int weight = (i & 1 ? kLight : kHeavy);
+    int r = Lookup(i);
+    if (r >= 0) {
+      cached_weight += weight;
+      ASSERT_EQ(1000 + i, r);
+    }
+  }
+  ASSERT_LE(cached_weight, kCacheSize + kCacheSize / 10);
+}
+
+TEST_F(CacheTest, NewId) {
+  uint64_t a = cache_->NewId();
+  uint64_t b = cache_->NewId();
+  ASSERT_NE(a, b);
+}
+
+TEST_F(CacheTest, Prune) {
+  Insert(1, 100);
+  Insert(2, 200);
+
+  Cache::Handle* handle = cache_->Lookup(EncodeKey(1));
+  ASSERT_TRUE(handle);
+  cache_->Prune();
+  cache_->Release(handle);
+
+  ASSERT_EQ(100, Lookup(1));
+  ASSERT_EQ(-1, Lookup(2));
+}
+
+TEST_F(CacheTest, ZeroSizeCache) {
+  delete cache_;
+  cache_ = NewLRUCache(0);
+
+  Insert(1, 100);
+  ASSERT_EQ(-1, Lookup(1));
+}
+
+}  // namespace table
+}  // namespace tensorflow
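The table.cc change below consumes this cache from the SSTable block reader. Each cached block is keyed by 16 bytes, the owning table's NewId()-assigned cache_id followed by the block's file offset, so multiple tables sharing one Cache cannot collide. The key construction, extracted from the BlockReader hunk below for emphasis:

  char cache_key_buffer[16];
  core::EncodeFixed64(cache_key_buffer, rep->cache_id);        // which table
  core::EncodeFixed64(cache_key_buffer + 8, handle.offset());  // which block
  absl::string_view key(cache_key_buffer, sizeof(cache_key_buffer));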
diff --git a/tensorflow/core/lib/io/table.cc b/tensorflow/core/lib/io/table.cc
index 1e68493bfe9..6cd1b21c14d 100644
--- a/tensorflow/core/lib/io/table.cc
+++ b/tensorflow/core/lib/io/table.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/coding.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/io/block.h"
+#include "tensorflow/core/lib/io/cache.h"
 #include "tensorflow/core/lib/io/format.h"
 #include "tensorflow/core/lib/io/table_options.h"
 #include "tensorflow/core/lib/io/two_level_iterator.h"
@@ -32,7 +33,7 @@ struct Table::Rep {
   Options options;
   Status status;
   RandomAccessFile* file;
-  // XXX uint64 cache_id;
+  uint64 cache_id;
   BlockHandle metaindex_handle;  // Handle to metaindex_block: saved from footer
   Block* index_block;
 
@@ -60,21 +61,18 @@ Status Table::Open(const Options& options, RandomAccessFile* file, uint64 size,
   Block* index_block = nullptr;
   if (s.ok()) {
     s = ReadBlock(file, footer.index_handle(), &contents);
-    if (s.ok()) {
-      index_block = new Block(contents);
-    }
   }
 
   if (s.ok()) {
     // We've successfully read the footer and the index block: we're
     // ready to serve requests.
+    index_block = new Block(contents);
     Rep* rep = new Table::Rep;
     rep->options = options;
     rep->file = file;
     rep->metaindex_handle = footer.metaindex_handle();
     rep->index_block = index_block;
-    // XXX rep->cache_id = (options.block_cache ?
-    // options.block_cache->NewId() : 0);
+    rep->cache_id = (options.block_cache ? options.block_cache->NewId() : 0);
     *table = new Table(rep);
   } else {
     if (index_block) delete index_block;
@@ -89,13 +87,24 @@ static void DeleteBlock(void* arg, void* ignored) {
   delete reinterpret_cast<Block*>(arg);
 }
 
+static void DeleteCachedBlock(const absl::string_view&, void* value) {
+  Block* block = reinterpret_cast<Block*>(value);
+  delete block;
+}
+
+static void ReleaseBlock(void* arg, void* h) {
+  Cache* cache = reinterpret_cast<Cache*>(arg);
+  Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
+  cache->Release(handle);
+}
+
 // Convert an index iterator value (i.e., an encoded BlockHandle)
 // into an iterator over the contents of the corresponding block.
 Iterator* Table::BlockReader(void* arg, const StringPiece& index_value) {
   Table* table = reinterpret_cast<Table*>(arg);
-  // Cache* block_cache = table->rep_->options.block_cache;
+  Cache* block_cache = table->rep_->options.block_cache;
   Block* block = nullptr;
-  // Cache::Handle* cache_handle = NULL;
+  Cache::Handle* cache_handle = NULL;
 
   BlockHandle handle;
   StringPiece input = index_value;
@@ -105,16 +114,38 @@ Iterator* Table::BlockReader(void* arg, const StringPiece& index_value) {
 
   if (s.ok()) {
     BlockContents contents;
-    s = ReadBlock(table->rep_->file, handle, &contents);
-    if (s.ok()) {
-      block = new Block(contents);
+    if (block_cache != nullptr) {
+      char cache_key_buffer[16];
+      core::EncodeFixed64(cache_key_buffer, table->rep_->cache_id);
+      core::EncodeFixed64(cache_key_buffer + 8, handle.offset());
+      absl::string_view key(cache_key_buffer, sizeof(cache_key_buffer));
+      cache_handle = block_cache->Lookup(key);
+      if (cache_handle != nullptr) {
+        block = reinterpret_cast<Block*>(block_cache->Value(cache_handle));
+      } else {
+        s = ReadBlock(table->rep_->file, handle, &contents);
+        if (s.ok()) {
+          block = new Block(contents);
+          cache_handle = block_cache->Insert(key, block, block->size(),
+                                             &DeleteCachedBlock);
+        }
+      }
+    } else {
+      s = ReadBlock(table->rep_->file, handle, &contents);
+      if (s.ok()) {
+        block = new Block(contents);
+      }
     }
   }
 
   Iterator* iter;
   if (block != nullptr) {
     iter = block->NewIterator();
-    iter->RegisterCleanup(&DeleteBlock, block, nullptr);
+    if (cache_handle == nullptr) {
+      iter->RegisterCleanup(&DeleteBlock, block, nullptr);
+    } else {
+      iter->RegisterCleanup(&ReleaseBlock, block_cache, cache_handle);
+    }
   } else {
     iter = NewErrorIterator(s);
   }
diff --git a/tensorflow/core/lib/io/table_options.h b/tensorflow/core/lib/io/table_options.h
index 9a36bf16315..d1b43ae7d43 100644
--- a/tensorflow/core/lib/io/table_options.h
+++ b/tensorflow/core/lib/io/table_options.h
@@ -21,6 +21,8 @@ limitations under the License.
 namespace tensorflow {
 namespace table {
 
+class Cache;
+
 // DB contents are stored in a set of blocks, each of which holds a
 // sequence of key,value pairs. Each block may be compressed before
 // being stored in a file. The following enum describes which
@@ -60,6 +62,12 @@ struct Options {
   // incompressible, the kSnappyCompression implementation will
   // efficiently detect that and will switch to uncompressed mode.
   CompressionType compression = kSnappyCompression;
+
+  // Control over blocks (user data is stored in a set of blocks, and
+  // a block is the unit of reading from disk).
+
+  // If non-null, use the specified cache for blocks.
diff --git a/tensorflow/core/lib/io/table_options.h b/tensorflow/core/lib/io/table_options.h
index 9a36bf16315..d1b43ae7d43 100644
--- a/tensorflow/core/lib/io/table_options.h
+++ b/tensorflow/core/lib/io/table_options.h
@@ -21,6 +21,8 @@ limitations under the License.
 namespace tensorflow {
 namespace table {
 
+class Cache;
+
 // DB contents are stored in a set of blocks, each of which holds a
 // sequence of key,value pairs.  Each block may be compressed before
 // being stored in a file.  The following enum describes which
@@ -60,6 +62,12 @@ struct Options {
   // incompressible, the kSnappyCompression implementation will
   // efficiently detect that and will switch to uncompressed mode.
   CompressionType compression = kSnappyCompression;
+
+  // Control over blocks (user data is stored in a set of blocks, and
+  // a block is the unit of reading from disk).
+
+  // If non-null, use the specified cache for blocks.
+  Cache* block_cache = nullptr;
 };
 
 }  // namespace table
diff --git a/tensorflow/core/ops/compat/BUILD b/tensorflow/core/ops/compat/BUILD
index 299076d8cfd..d8bfd4473f7 100644
--- a/tensorflow/core/ops/compat/BUILD
+++ b/tensorflow/core/ops/compat/BUILD
@@ -39,6 +39,10 @@ tf_cc_test(
         "ops_history_v*/*.pbtxt",
         "ops_history.v*.pbtxt",
     ]),
+    tags = [
+        "no_oss",  # TODO(b/150030420): Reenable when fix lands.
+        "notap",  # TODO(b/150030420): Reenable when fix lands.
+    ],
     deps = [
         ":op_compatibility_lib",
         "//tensorflow/core:framework",
diff --git a/tensorflow/core/platform/BUILD b/tensorflow/core/platform/BUILD
index fb40e56829d..54369ce18f2 100644
--- a/tensorflow/core/platform/BUILD
+++ b/tensorflow/core/platform/BUILD
@@ -760,6 +760,62 @@ cc_library(
     ] + tf_platform_deps("unbounded_work_queue"),
 )
 
+cc_library(
+    name = "retrying_utils",
+    srcs = [
+        "retrying_utils.cc",
+    ],
+    hdrs = [
+        "retrying_utils.h",
+    ],
+    copts = tf_copts(),
+    deps = [
+        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+cc_library(
+    name = "retrying_file_system",
+    hdrs = [
+        "retrying_file_system.h",
+    ],
+    copts = tf_copts(),
+    deps = [
+        ":retrying_utils",
+        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_cc_test(
+    name = "retrying_file_system_test",
+    size = "small",
+    srcs = ["retrying_file_system_test.cc"],
+    deps = [
+        ":retrying_file_system",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/platform:str_util",
+    ],
+)
+
+tf_cc_test(
+    name = "retrying_utils_test",
+    size = "small",
+    srcs = ["retrying_utils_test.cc"],
+    deps = [
+        ":retrying_utils",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/platform:str_util",
+    ],
+)
+
 # This is a hacky, do-nothing, binary that makes it easy to verify ability to
 # build, link, and in some cases run all of the libraries under platform.
# Realistically, most of this would be covered by tests but at this point @@ -791,6 +847,8 @@ cc_binary( ":png", ":prefetch", ":protobuf", + ":retrying_utils", + ":retrying_file_system", ":scanner", ":setround", ":stacktrace", diff --git a/tensorflow/core/platform/cloud/BUILD b/tensorflow/core/platform/cloud/BUILD index 53c4f6cda1f..fe08edceae9 100644 --- a/tensorflow/core/platform/cloud/BUILD +++ b/tensorflow/core/platform/cloud/BUILD @@ -92,14 +92,14 @@ cc_library( ":google_auth_provider", ":http_request", ":ram_file_block_cache", - ":retrying_file_system", - ":retrying_utils", ":time_util", "//tensorflow/core:framework_headers_lib", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core/platform:numbers", "//tensorflow/core/platform:path", + "//tensorflow/core/platform:retrying_file_system", + "//tensorflow/core/platform:retrying_utils", "//tensorflow/core/platform:str_util", "//tensorflow/core/platform:stringprintf", "@jsoncpp_git//:jsoncpp", @@ -128,14 +128,14 @@ cc_library( ":google_auth_provider", ":http_request", ":ram_file_block_cache", - ":retrying_file_system", - ":retrying_utils", ":time_util", "//tensorflow/core:framework_headers_lib", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core/platform:numbers", "//tensorflow/core/platform:path", + "//tensorflow/core/platform:retrying_file_system", + "//tensorflow/core/platform:retrying_utils", "//tensorflow/core/platform:str_util", "//tensorflow/core/platform:stringprintf", "@jsoncpp_git//:jsoncpp", @@ -200,12 +200,12 @@ cc_library( deps = [ ":compute_engine_metadata_client", ":oauth_client", - ":retrying_utils", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core/platform:base64", "//tensorflow/core/platform:errors", "//tensorflow/core/platform:path", + "//tensorflow/core/platform:retrying_utils", "//tensorflow/core/platform:status", "@com_google_absl//absl/strings", "@jsoncpp_git//:jsoncpp", @@ -224,9 +224,9 @@ cc_library( deps = [ ":curl_http_request", ":http_request", - ":retrying_utils", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "//tensorflow/core/platform:retrying_utils", ], ) @@ -283,34 +283,6 @@ cc_library( ], ) -cc_library( - name = "retrying_utils", - srcs = [ - "retrying_utils.cc", - ], - hdrs = [ - "retrying_utils.h", - ], - copts = tf_copts(), - deps = [ - "//tensorflow/core:framework_headers_lib", - "//tensorflow/core:lib_internal", - ], -) - -cc_library( - name = "retrying_file_system", - hdrs = [ - "retrying_file_system.h", - ], - copts = tf_copts(), - deps = [ - ":retrying_utils", - "//tensorflow/core:framework_headers_lib", - "//tensorflow/core:lib_internal", - ], -) - cc_library( name = "time_util", srcs = [ @@ -482,20 +454,6 @@ tf_cc_test( ], ) -tf_cc_test( - name = "retrying_file_system_test", - size = "small", - srcs = ["retrying_file_system_test.cc"], - deps = [ - ":retrying_file_system", - "//tensorflow/core:lib", - "//tensorflow/core:lib_internal", - "//tensorflow/core:test", - "//tensorflow/core:test_main", - "//tensorflow/core/platform:str_util", - ], -) - tf_cc_test( name = "time_util_test", size = "small", @@ -506,17 +464,3 @@ tf_cc_test( "//tensorflow/core:test_main", ], ) - -tf_cc_test( - name = "retrying_utils_test", - size = "small", - srcs = ["retrying_utils_test.cc"], - deps = [ - ":retrying_utils", - "//tensorflow/core:lib", - "//tensorflow/core:lib_internal", - "//tensorflow/core:test", - "//tensorflow/core:test_main", - "//tensorflow/core/platform:str_util", - ], -) diff --git 
a/tensorflow/core/platform/cloud/compute_engine_metadata_client.h b/tensorflow/core/platform/cloud/compute_engine_metadata_client.h index d7611615606..164380b4141 100644 --- a/tensorflow/core/platform/cloud/compute_engine_metadata_client.h +++ b/tensorflow/core/platform/cloud/compute_engine_metadata_client.h @@ -17,7 +17,7 @@ limitations under the License. #define TENSORFLOW_CORE_PLATFORM_CLOUD_COMPUTE_ENGINE_METADATA_CLIENT_H_ #include "tensorflow/core/platform/cloud/http_request.h" -#include "tensorflow/core/platform/cloud/retrying_utils.h" +#include "tensorflow/core/platform/retrying_utils.h" #include "tensorflow/core/platform/status.h" namespace tensorflow { diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc index 054ad242692..57847d2ea38 100644 --- a/tensorflow/core/platform/cloud/gcs_file_system.cc +++ b/tensorflow/core/platform/cloud/gcs_file_system.cc @@ -32,7 +32,6 @@ limitations under the License. #include "tensorflow/core/platform/cloud/file_block_cache.h" #include "tensorflow/core/platform/cloud/google_auth_provider.h" #include "tensorflow/core/platform/cloud/ram_file_block_cache.h" -#include "tensorflow/core/platform/cloud/retrying_utils.h" #include "tensorflow/core/platform/cloud/time_util.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/errors.h" @@ -40,6 +39,7 @@ limitations under the License. #include "tensorflow/core/platform/numbers.h" #include "tensorflow/core/platform/path.h" #include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/retrying_utils.h" #include "tensorflow/core/platform/str_util.h" #include "tensorflow/core/platform/stringprintf.h" #include "tensorflow/core/platform/thread_annotations.h" diff --git a/tensorflow/core/platform/cloud/gcs_file_system.h b/tensorflow/core/platform/cloud/gcs_file_system.h index b075cbe9828..98933532b17 100644 --- a/tensorflow/core/platform/cloud/gcs_file_system.h +++ b/tensorflow/core/platform/cloud/gcs_file_system.h @@ -29,8 +29,8 @@ limitations under the License. #include "tensorflow/core/platform/cloud/gcs_dns_cache.h" #include "tensorflow/core/platform/cloud/gcs_throttle.h" #include "tensorflow/core/platform/cloud/http_request.h" -#include "tensorflow/core/platform/cloud/retrying_file_system.h" #include "tensorflow/core/platform/file_system.h" +#include "tensorflow/core/platform/retrying_file_system.h" #include "tensorflow/core/platform/status.h" namespace tensorflow { diff --git a/tensorflow/core/platform/cloud/google_auth_provider.cc b/tensorflow/core/platform/cloud/google_auth_provider.cc index b8d2acd83ff..e8546ca022f 100644 --- a/tensorflow/core/platform/cloud/google_auth_provider.cc +++ b/tensorflow/core/platform/cloud/google_auth_provider.cc @@ -26,10 +26,10 @@ limitations under the License. #include "absl/strings/match.h" #include "include/json/json.h" #include "tensorflow/core/platform/base64.h" -#include "tensorflow/core/platform/cloud/retrying_utils.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/path.h" +#include "tensorflow/core/platform/retrying_utils.h" namespace tensorflow { @@ -176,12 +176,19 @@ Status GoogleAuthProvider::GetToken(string* t) { return Status::OK(); } - LOG(WARNING) - << "All attempts to get a Google authentication bearer token failed, " - << "returning an empty token. Retrieving token from files failed with \"" - << token_from_files_status.ToString() << "\"." 
- << " Retrieving token from GCE failed with \"" - << token_from_gce_status.ToString() << "\"."; + if (skip_gce_check) { + LOG(INFO) + << "Attempting an empty bearer token since no token was retrieved " + << "from files, and GCE metadata check was skipped."; + } else { + LOG(WARNING) + << "All attempts to get a Google authentication bearer token failed, " + << "returning an empty token. Retrieving token from files failed with " + "\"" + << token_from_files_status.ToString() << "\"." + << " Retrieving token from GCE failed with \"" + << token_from_gce_status.ToString() << "\"."; + } // Public objects can still be accessed with an empty bearer token, // so return an empty token instead of failing. diff --git a/tensorflow/core/platform/default/port.cc b/tensorflow/core/platform/default/port.cc index 47f4abae3bb..756e7e8a93a 100644 --- a/tensorflow/core/platform/default/port.cc +++ b/tensorflow/core/platform/default/port.cc @@ -332,6 +332,16 @@ bool Snappy_Uncompress(const char* input, size_t length, char* output) { #endif } +bool Snappy_UncompressToIOVec(const char* compressed, size_t compressed_length, + const struct iovec* iov, size_t iov_cnt) { +#ifdef TF_USE_SNAPPY + return snappy::RawUncompressToIOVec(compressed, compressed_length, iov, + iov_cnt); +#else + return false; +#endif +} + string Demangle(const char* mangled) { return mangled; } double NominalCPUFrequency() { diff --git a/tensorflow/core/platform/cloud/retrying_file_system.h b/tensorflow/core/platform/retrying_file_system.h similarity index 99% rename from tensorflow/core/platform/cloud/retrying_file_system.h rename to tensorflow/core/platform/retrying_file_system.h index 912778f4dd1..396985066b6 100644 --- a/tensorflow/core/platform/cloud/retrying_file_system.h +++ b/tensorflow/core/platform/retrying_file_system.h @@ -21,10 +21,10 @@ limitations under the License. #include #include "tensorflow/core/lib/random/random.h" -#include "tensorflow/core/platform/cloud/retrying_utils.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/file_system.h" +#include "tensorflow/core/platform/retrying_utils.h" #include "tensorflow/core/platform/status.h" namespace tensorflow { diff --git a/tensorflow/core/platform/cloud/retrying_file_system_test.cc b/tensorflow/core/platform/retrying_file_system_test.cc similarity index 99% rename from tensorflow/core/platform/cloud/retrying_file_system_test.cc rename to tensorflow/core/platform/retrying_file_system_test.cc index b48831ab238..b43c3375265 100644 --- a/tensorflow/core/platform/cloud/retrying_file_system_test.cc +++ b/tensorflow/core/platform/retrying_file_system_test.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/platform/cloud/retrying_file_system.h" +#include "tensorflow/core/platform/retrying_file_system.h" #include diff --git a/tensorflow/core/platform/cloud/retrying_utils.cc b/tensorflow/core/platform/retrying_utils.cc similarity index 98% rename from tensorflow/core/platform/cloud/retrying_utils.cc rename to tensorflow/core/platform/retrying_utils.cc index 1f0c41824bf..1b6fa68c31c 100644 --- a/tensorflow/core/platform/cloud/retrying_utils.cc +++ b/tensorflow/core/platform/retrying_utils.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
 ==============================================================================*/
-#include "tensorflow/core/platform/cloud/retrying_utils.h"
+#include "tensorflow/core/platform/retrying_utils.h"
 
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/env.h"
diff --git a/tensorflow/core/platform/cloud/retrying_utils.h b/tensorflow/core/platform/retrying_utils.h
similarity index 100%
rename from tensorflow/core/platform/cloud/retrying_utils.h
rename to tensorflow/core/platform/retrying_utils.h
diff --git a/tensorflow/core/platform/cloud/retrying_utils_test.cc b/tensorflow/core/platform/retrying_utils_test.cc
similarity index 98%
rename from tensorflow/core/platform/cloud/retrying_utils_test.cc
rename to tensorflow/core/platform/retrying_utils_test.cc
index 7a2dbacacc8..5b162571067 100644
--- a/tensorflow/core/platform/cloud/retrying_utils_test.cc
+++ b/tensorflow/core/platform/retrying_utils_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/platform/cloud/retrying_utils.h"
+#include "tensorflow/core/platform/retrying_utils.h"
 
 #include <fstream>
diff --git a/tensorflow/core/platform/s3/BUILD b/tensorflow/core/platform/s3/BUILD
index a5494d5c318..d174b108279 100644
--- a/tensorflow/core/platform/s3/BUILD
+++ b/tensorflow/core/platform/s3/BUILD
@@ -33,6 +33,8 @@ tf_cc_binary(
     linkshared = 1,
     deps = [
         "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core/platform:retrying_file_system",
+        "//tensorflow/core/platform:retrying_utils",
         "@aws",
         "@com_google_protobuf//:protobuf_headers",
         "@curl",
diff --git a/tensorflow/core/platform/s3/aws_crypto.cc b/tensorflow/core/platform/s3/aws_crypto.cc
index 90e46d6c1da..6a473027ebd 100644
--- a/tensorflow/core/platform/s3/aws_crypto.cc
+++ b/tensorflow/core/platform/s3/aws_crypto.cc
@@ -13,11 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/platform/s3/aws_crypto.h"
-#include <openssl/hmac.h>
-#include <openssl/sha.h>
 
 #include <aws/core/utils/crypto/HashResult.h>
 #include <aws/s3/S3Client.h>
+#include <openssl/hmac.h>
+#include <openssl/rand.h>
+#include <openssl/sha.h>
 
 namespace tensorflow {
 
@@ -100,6 +101,22 @@ class AWSSha256OpenSSLImpl : public Aws::Utils::Crypto::Hash {
   }
 };
 
+class AWSSecureRandomBytesImpl : public Aws::Utils::Crypto::SecureRandomBytes {
+ public:
+  AWSSecureRandomBytesImpl() {}
+  virtual ~AWSSecureRandomBytesImpl() = default;
+  virtual void GetBytes(unsigned char* buffer, size_t bufferSize) override {
+    assert(buffer);
+    int success = RAND_bytes(buffer, static_cast<int>(bufferSize));
+    if (success != 1) {
+      m_failure = true;
+    }
+  }
+
+ private:
+  bool m_failure;
+};
+
 std::shared_ptr<Aws::Utils::Crypto::Hash>
 AWSSHA256Factory::CreateImplementation() const {
   return Aws::MakeShared<AWSSha256OpenSSLImpl>(AWSCryptoAllocationTag);
@@ -110,4 +127,9 @@ AWSSHA256HmacFactory::CreateImplementation() const {
   return Aws::MakeShared<AWSSha256HMACOpenSSLImpl>(AWSCryptoAllocationTag);
 }
 
+std::shared_ptr<Aws::Utils::Crypto::SecureRandomBytes>
+AWSSecureRandomFactory::CreateImplementation() const {
+  return Aws::MakeShared<AWSSecureRandomBytesImpl>(AWSCryptoAllocationTag);
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/s3/aws_crypto.h b/tensorflow/core/platform/s3/aws_crypto.h
index f05771b904a..d0c41eeef83 100644
--- a/tensorflow/core/platform/s3/aws_crypto.h
+++ b/tensorflow/core/platform/s3/aws_crypto.h
@@ -16,6 +16,7 @@ limitations under the License.
 #include <aws/core/utils/crypto/Factories.h>
 #include <aws/core/utils/crypto/HMAC.h>
 #include <aws/core/utils/crypto/Hash.h>
+#include <aws/core/utils/crypto/SecureRandom.h>
 
 namespace tensorflow {
 static const char* AWSCryptoAllocationTag = "AWSCryptoAllocation";
@@ -32,4 +33,10 @@ class AWSSHA256HmacFactory : public Aws::Utils::Crypto::HMACFactory {
       const override;
 };
 
+class AWSSecureRandomFactory : public Aws::Utils::Crypto::SecureRandomFactory {
+ public:
+  std::shared_ptr<Aws::Utils::Crypto::SecureRandomBytes> CreateImplementation()
+      const override;
+};
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/s3/s3_file_system.cc b/tensorflow/core/platform/s3/s3_file_system.cc
index 27e7a4e01fb..9749fec8ee5 100644
--- a/tensorflow/core/platform/s3/s3_file_system.cc
+++ b/tensorflow/core/platform/s3/s3_file_system.cc
@@ -22,6 +22,10 @@ limitations under the License.
 #include <aws/core/utils/StringUtils.h>
 #include <aws/s3/S3Client.h>
 #include <aws/s3/S3Errors.h>
+#include <aws/s3/model/AbortMultipartUploadRequest.h>
+#include <aws/s3/model/CompleteMultipartUploadRequest.h>
+#include <aws/s3/model/CompletedMultipartUpload.h>
+#include <aws/s3/model/CompletedPart.h>
 #include <aws/s3/model/CopyObjectRequest.h>
 #include <aws/s3/model/DeleteObjectRequest.h>
 #include <aws/s3/model/GetObjectRequest.h>
@@ -29,7 +33,9 @@ limitations under the License.
 #include <aws/s3/model/HeadObjectRequest.h>
 #include <aws/s3/model/ListObjectsRequest.h>
 #include <aws/s3/model/PutObjectRequest.h>
+#include <aws/s3/model/UploadPartCopyRequest.h>
+#include <aws/transfer/TransferManager.h>
 #include <cstdlib>
 
 #include "tensorflow/core/platform/file_system_helper.h"
@@ -44,7 +50,12 @@ namespace tensorflow {
 namespace {
 static const char* kS3FileSystemAllocationTag = "S3FileSystemAllocation";
 static const size_t kS3ReadAppendableFileBufferSize = 1024 * 1024;
+static const int64 kS3TimeoutMsec = 300000;  // 5 min
+static const uint64 kS3MultiPartCopyPartSize = 50 * 1024 * 1024;  // 50MB
 static const int kS3GetChildrenMaxKeys = 100;
+static const int kExecutorPoolSize = 5;
+static const int kUploadRetries = 5;
+static const char* kExecutorTag = "TransferManagerExecutor";
 
 Aws::Client::ClientConfiguration& GetDefaultClientConfig() {
   static mutex cfg_lock(LINKER_INITIALIZED);
@@ -108,22 +119,22 @@ Aws::Client::ClientConfiguration& GetDefaultClientConfig() {
       cfg.verifySSL = true;
     }
   }
-  const char* connect_timeout = getenv("S3_CONNECT_TIMEOUT_MSEC");
-  if (connect_timeout) {
-    int64 timeout;
-
-    if (strings::safe_strto64(connect_timeout, &timeout)) {
-      cfg.connectTimeoutMs = timeout;
-    }
+  // if these timeouts are low, you may see an error when
+  // uploading/downloading large files: Unable to connect to endpoint
+  const char* connect_timeout_str = getenv("S3_CONNECT_TIMEOUT_MSEC");
+  int64 connect_timeout = kS3TimeoutMsec;
+  if (connect_timeout_str) {
+    // if conversion is unsafe, below method doesn't modify connect_timeout
+    strings::safe_strto64(connect_timeout_str, &connect_timeout);
   }
-  const char* request_timeout = getenv("S3_REQUEST_TIMEOUT_MSEC");
-  if (request_timeout) {
-    int64 timeout;
+  cfg.connectTimeoutMs = connect_timeout;
 
-    if (strings::safe_strto64(request_timeout, &timeout)) {
-      cfg.requestTimeoutMs = timeout;
-    }
+  const char* request_timeout_str = getenv("S3_REQUEST_TIMEOUT_MSEC");
+  int64 request_timeout = kS3TimeoutMsec;
+  if (request_timeout_str) {
+    strings::safe_strto64(request_timeout_str, &request_timeout);
   }
+  cfg.requestTimeoutMs = request_timeout;
   const char* ca_file = getenv("S3_CA_FILE");
   if (ca_file) {
     cfg.caFile = Aws::String(ca_file);
@@ -148,6 +159,18 @@ void ShutdownClient(Aws::S3::S3Client* s3_client) {
   }
 }
 
+void ShutdownTransferManager(
+    Aws::Transfer::TransferManager* transfer_manager) {
+  if (transfer_manager != nullptr) {
+    delete transfer_manager;
+  }
+}
+
+void ShutdownExecutor(Aws::Utils::Threading::PooledThreadExecutor* executor) {
+  if (executor != nullptr) {
+    delete executor;
+  }
+}
+
 Status ParseS3Path(const string& fname, bool empty_object_ok, string* bucket,
                    string* object) {
   if (!bucket || !object) {
@@ -173,6 +196,23 @@ Status ParseS3Path(const string& fname, bool empty_object_ok, string* bucket,
   return Status::OK();
 }
 
+static Status CheckForbiddenError(
+    const Aws::Client::AWSError<Aws::S3::S3Errors>& error) {
+  if (error.GetResponseCode() == Aws::Http::HttpResponseCode::FORBIDDEN) {
+    return errors::FailedPrecondition(
+        "AWS Credentials have not been set properly. "
+        "Unable to access the specified S3 location");
+  } else {
+    return Status::OK();
+  }
+}
+
+static Status CreateStatusFromAwsError(
+    const Aws::Client::AWSError<Aws::S3::S3Errors>& error) {
+  TF_RETURN_IF_ERROR(CheckForbiddenError(error));
+  return errors::Unknown(error.GetExceptionName(), ": ", error.GetMessage());
+}
+
 class S3RandomAccessFile : public RandomAccessFile {
  public:
   S3RandomAccessFile(const string& bucket, const string& object,
@@ -194,9 +234,14 @@ class S3RandomAccessFile : public RandomAccessFile {
     });
     auto getObjectOutcome = this->s3_client_->GetObject(getObjectRequest);
     if (!getObjectOutcome.IsSuccess()) {
-      n = 0;
-      *result = StringPiece(scratch, n);
-      return Status(error::OUT_OF_RANGE, "Read less bytes than requested");
+      auto error = getObjectOutcome.GetError();
+      if (error.GetResponseCode() ==
+          Aws::Http::HttpResponseCode::REQUESTED_RANGE_NOT_SATISFIABLE) {
+        n = 0;
+        *result = StringPiece(scratch, n);
+        return Status(error::OUT_OF_RANGE, "Read less bytes than requested");
+      }
+      return CreateStatusFromAwsError(error);
     }
     n = getObjectOutcome.GetResult().GetContentLength();
     getObjectOutcome.GetResult().GetBody().read(scratch, n);
@@ -213,11 +258,14 @@ class S3RandomAccessFile : public RandomAccessFile {
 };
 
 class S3WritableFile : public WritableFile {
  public:
-  S3WritableFile(const string& bucket, const string& object,
-                 std::shared_ptr<Aws::S3::S3Client> s3_client)
+  S3WritableFile(
+      const string& bucket, const string& object,
+      std::shared_ptr<Aws::Transfer::TransferManager> transfer_manager,
+      std::shared_ptr<Aws::S3::S3Client> s3_client)
       : bucket_(bucket),
         object_(object),
         s3_client_(s3_client),
+        transfer_manager_(transfer_manager),
         sync_needed_(true),
         outfile_(Aws::MakeShared<Aws::Utils::TempFile>(
             kS3FileSystemAllocationTag, "/tmp/s3_filesystem_XXXXXX",
@@ -260,19 +308,32 @@ class S3WritableFile : public WritableFile {
     if (!sync_needed_) {
       return Status::OK();
     }
-    Aws::S3::Model::PutObjectRequest putObjectRequest;
-    putObjectRequest.WithBucket(bucket_.c_str()).WithKey(object_.c_str());
     long offset = outfile_->tellp();
-    outfile_->seekg(0);
-    putObjectRequest.SetBody(outfile_);
-    putObjectRequest.SetContentLength(offset);
-    auto putObjectOutcome = this->s3_client_->PutObject(putObjectRequest);
+    std::shared_ptr<Aws::Transfer::TransferHandle> handle =
+        transfer_manager_.get()->UploadFile(
+            outfile_, bucket_.c_str(), object_.c_str(),
+            "application/octet-stream", Aws::Map<Aws::String, Aws::String>());
+    handle->WaitUntilFinished();
+    int retries = 0;
+
+    while (handle->GetStatus() == Aws::Transfer::TransferStatus::FAILED &&
+           retries++ < kUploadRetries) {
+      // if multipart upload was used, only the failed parts will be re-sent
+      VLOG(1) << "Retrying Upload of s3://" << bucket_ << "/" << object_
+              << " after failure. Current retry count:" << retries;
+      transfer_manager_.get()->RetryUpload(outfile_, handle);
+      handle->WaitUntilFinished();
+    }
+
+    if (handle->GetStatus() != Aws::Transfer::TransferStatus::COMPLETED) {
+      auto error = handle->GetLastError();
+      TF_RETURN_IF_ERROR(CheckForbiddenError(error));
+      return errors::Unknown(error.GetExceptionName(), ": ",
+                             handle->GetFailedParts().size(),
+                             " failed parts. ",
+                             handle->GetLastError().GetMessage());
+    }
", + handle->GetLastError().GetMessage()); + } outfile_->clear(); outfile_->seekp(offset); - if (!putObjectOutcome.IsSuccess()) { - return errors::Unknown(putObjectOutcome.GetError().GetExceptionName(), - ": ", putObjectOutcome.GetError().GetMessage()); - } sync_needed_ = false; return Status::OK(); } @@ -281,6 +342,7 @@ class S3WritableFile : public WritableFile { string bucket_; string object_; std::shared_ptr s3_client_; + std::shared_ptr transfer_manager_; bool sync_needed_; std::shared_ptr outfile_; }; @@ -300,13 +362,25 @@ class S3ReadOnlyMemoryRegion : public ReadOnlyMemoryRegion { } // namespace S3FileSystem::S3FileSystem() - : s3_client_(nullptr, ShutdownClient), client_lock_() {} + : s3_client_(nullptr, ShutdownClient), + initialization_lock_(), + transfer_manager_(nullptr, ShutdownTransferManager), + executor_(nullptr, ShutdownExecutor) { + const char* part_size_str = getenv("S3_MULTI_PART_COPY_PART_SIZE"); + multi_part_copy_part_size_ = kS3MultiPartCopyPartSize; + if (part_size_str) { + uint64 part_size_num; + if (strings::safe_strtou64(part_size_str, &part_size_num)) { + multi_part_copy_part_size_ = part_size_num; + } + } +} S3FileSystem::~S3FileSystem() {} // Initializes s3_client_, if needed, and returns it. std::shared_ptr S3FileSystem::GetS3Client() { - std::lock_guard lock(this->client_lock_); + std::lock_guard lock(this->initialization_lock_); if (this->s3_client_.get() == nullptr) { AWSLogSystem::InitializeAWSLogging(); @@ -318,6 +392,9 @@ std::shared_ptr S3FileSystem::GetS3Client() { options.cryptoOptions.sha256HMACFactory_create_fn = []() { return Aws::MakeShared(AWSCryptoAllocationTag); }; + options.cryptoOptions.secureRandomFactory_create_fn = []() { + return Aws::MakeShared(AWSCryptoAllocationTag); + }; Aws::InitAPI(options); // The creation of S3Client disables virtual addressing: @@ -334,6 +411,33 @@ std::shared_ptr S3FileSystem::GetS3Client() { return this->s3_client_; } +std::shared_ptr +S3FileSystem::GetTransferManager() { + std::shared_ptr s3_client = this->GetS3Client(); + std::lock_guard lock(this->initialization_lock_); + if (this->transfer_manager_.get() == nullptr) { + Aws::Transfer::TransferManagerConfiguration config( + this->GetExecutor().get()); + config.s3Client = s3_client; + config.bufferSize = this->multi_part_copy_part_size_; + // must be larger than pool size * multi_part_copy_part_size + config.transferBufferMaxHeapSize = + (kExecutorPoolSize + 1) * this->multi_part_copy_part_size_; + this->transfer_manager_ = Aws::Transfer::TransferManager::Create(config); + } + return this->transfer_manager_; +} + +std::shared_ptr +S3FileSystem::GetExecutor() { + if (this->executor_.get() == nullptr) { + this->executor_ = + Aws::MakeShared( + kExecutorTag, kExecutorPoolSize); + } + return this->executor_; +} + Status S3FileSystem::NewRandomAccessFile( const string& fname, std::unique_ptr* result) { string bucket, object; @@ -346,7 +450,8 @@ Status S3FileSystem::NewWritableFile(const string& fname, std::unique_ptr* result) { string bucket, object; TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object)); - result->reset(new S3WritableFile(bucket, object, this->GetS3Client())); + result->reset(new S3WritableFile(bucket, object, this->GetTransferManager(), + this->GetS3Client())); return Status::OK(); } @@ -361,7 +466,8 @@ Status S3FileSystem::NewAppendableFile(const string& fname, string bucket, object; TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object)); - result->reset(new S3WritableFile(bucket, object, this->GetS3Client())); + 
+
 Status S3FileSystem::NewRandomAccessFile(
     const string& fname, std::unique_ptr<RandomAccessFile>* result) {
   string bucket, object;
@@ -346,7 +450,8 @@ Status S3FileSystem::NewWritableFile(const string& fname,
                                      std::unique_ptr<WritableFile>* result) {
   string bucket, object;
   TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object));
-  result->reset(new S3WritableFile(bucket, object, this->GetS3Client()));
+  result->reset(new S3WritableFile(bucket, object, this->GetTransferManager(),
+                                   this->GetS3Client()));
   return Status::OK();
 }
 
@@ -361,7 +466,8 @@ Status S3FileSystem::NewAppendableFile(const string& fname,
   string bucket, object;
   TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object));
 
-  result->reset(new S3WritableFile(bucket, object, this->GetS3Client()));
+  result->reset(new S3WritableFile(bucket, object, this->GetTransferManager(),
+                                   this->GetS3Client()));
 
   while (true) {
     status = reader->Read(offset, kS3ReadAppendableFileBufferSize, &read_chunk,
@@ -425,8 +531,7 @@ Status S3FileSystem::GetChildren(const string& dir,
     auto listObjectsOutcome =
         this->GetS3Client()->ListObjects(listObjectsRequest);
     if (!listObjectsOutcome.IsSuccess()) {
-      return errors::Unknown(listObjectsOutcome.GetError().GetExceptionName(),
-                             ": ", listObjectsOutcome.GetError().GetMessage());
+      return CreateStatusFromAwsError(listObjectsOutcome.GetError());
     }
     listObjectsResult = listObjectsOutcome.GetResult();
@@ -460,8 +565,7 @@ Status S3FileSystem::Stat(const string& fname, FileStatistics* stats) {
     headBucketRequest.WithBucket(bucket.c_str());
     auto headBucketOutcome = this->GetS3Client()->HeadBucket(headBucketRequest);
     if (!headBucketOutcome.IsSuccess()) {
-      return errors::Unknown(headBucketOutcome.GetError().GetExceptionName(),
-                             ": ", headBucketOutcome.GetError().GetMessage());
+      return CreateStatusFromAwsError(headBucketOutcome.GetError());
     }
     stats->length = 0;
     stats->is_directory = 1;
@@ -481,6 +585,8 @@ Status S3FileSystem::Stat(const string& fname, FileStatistics* stats) {
     stats->mtime_nsec =
         headObjectOutcome.GetResult().GetLastModified().Millis() * 1e6;
     found = true;
+  } else {
+    TF_RETURN_IF_ERROR(CheckForbiddenError(headObjectOutcome.GetError()));
   }
   string prefix = object;
   if (prefix.back() != '/') {
@@ -495,11 +601,15 @@ Status S3FileSystem::Stat(const string& fname, FileStatistics* stats) {
   auto listObjectsOutcome =
       this->GetS3Client()->ListObjects(listObjectsRequest);
   if (listObjectsOutcome.IsSuccess()) {
-    if (listObjectsOutcome.GetResult().GetContents().size() > 0) {
+    auto listObjects = listObjectsOutcome.GetResult().GetContents();
+    if (listObjects.size() > 0) {
       stats->length = 0;
       stats->is_directory = 1;
+      stats->mtime_nsec = listObjects[0].GetLastModified().Millis() * 1e6;
       found = true;
     }
+  } else {
+    TF_RETURN_IF_ERROR(CheckForbiddenError(listObjectsOutcome.GetError()));
   }
   if (!found) {
     return errors::NotFound("Object ", fname, " does not exist");
@@ -522,8 +632,7 @@ Status S3FileSystem::DeleteFile(const string& fname) {
   auto deleteObjectOutcome =
       this->GetS3Client()->DeleteObject(deleteObjectRequest);
   if (!deleteObjectOutcome.IsSuccess()) {
-    return errors::Unknown(deleteObjectOutcome.GetError().GetExceptionName(),
-                           ": ", deleteObjectOutcome.GetError().GetMessage());
+    return CreateStatusFromAwsError(deleteObjectOutcome.GetError());
   }
   return Status::OK();
 }
@@ -537,6 +646,7 @@ Status S3FileSystem::CreateDir(const string& dirname) {
     headBucketRequest.WithBucket(bucket.c_str());
     auto headBucketOutcome = this->GetS3Client()->HeadBucket(headBucketRequest);
     if (!headBucketOutcome.IsSuccess()) {
+      TF_RETURN_IF_ERROR(CheckForbiddenError(headBucketOutcome.GetError()));
       return errors::NotFound("The bucket ", bucket, " was not found.");
     }
     return Status::OK();
@@ -545,9 +655,11 @@ Status S3FileSystem::CreateDir(const string& dirname) {
   if (filename.back() != '/') {
     filename.push_back('/');
   }
-  std::unique_ptr<WritableFile> file;
-  TF_RETURN_IF_ERROR(NewWritableFile(filename, &file));
-  TF_RETURN_IF_ERROR(file->Close());
+  if (!this->FileExists(filename).ok()) {
+    std::unique_ptr<WritableFile> file;
+    TF_RETURN_IF_ERROR(NewWritableFile(filename, &file));
+    TF_RETURN_IF_ERROR(file->Close());
+  }
   return Status::OK();
 }
 
@@ -571,7 +683,10 @@ Status S3FileSystem::DeleteDir(const string& dirname) {
     auto contents = listObjectsOutcome.GetResult().GetContents();
     if (contents.size() > 1 ||
         (contents.size() == 1 && contents[0].GetKey() != prefix.c_str())) {
-      return errors::FailedPrecondition("Cannot delete a non-empty directory.");
+      return errors::Unknown(
+          "Cannot delete a non-empty directory. "
+          "This operation will be retried in case this "
+          "is due to S3's eventual consistency.");
     }
     if (contents.size() == 1 && contents[0].GetKey() == prefix.c_str()) {
       string filename = dirname;
@@ -580,6 +695,8 @@ Status S3FileSystem::DeleteDir(const string& dirname) {
       }
       return DeleteFile(filename);
    }
+  } else {
+    TF_RETURN_IF_ERROR(CheckForbiddenError(listObjectsOutcome.GetError()));
   }
   return Status::OK();
 }
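Before the multipart-copy machinery below, the part arithmetic CopyFile relies on is worth checking in isolation (a standalone sketch mirroring its logic; the helper name and the 1TB example numbers are mine):

// Standalone check of the part arithmetic used by CopyFile() below.
#include <cmath>
#include <cstdint>

int NumParts(uint64_t file_length, uint64_t part_size) {
  if (file_length <= part_size) return 1;
  return static_cast<int>(
      std::ceil(static_cast<double>(file_length) / part_size));
}

// With the default 50MB part size, a 1TB object needs
//   ceil(1099511627776 / 52428800) = 20972 parts,
// which exceeds the 10000-part cap enforced below, so
// S3_MULTI_PART_COPY_PART_SIZE would have to be raised to at least
//   ceil(1099511627776 / 10000) = 109951163 bytes (about 105MB).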
@@ -591,6 +708,260 @@ Status S3FileSystem::GetFileSize(const string& fname, uint64* file_size) {
   return Status::OK();
 }
 
+void S3FileSystem::MultiPartCopyCallback(
+    const Aws::S3::Model::UploadPartCopyRequest& request,
+    const Aws::S3::Model::UploadPartCopyOutcome& uploadPartCopyOutcome,
+    const std::shared_ptr<const Aws::Client::AsyncCallerContext>& context) {
+  std::shared_ptr<MultiPartCopyAsyncContext> multiPartContext =
+      std::const_pointer_cast<MultiPartCopyAsyncContext>(
+          std::static_pointer_cast<const MultiPartCopyAsyncContext>(context));
+
+  {
+    std::unique_lock<std::mutex> lock(
+        *multiPartContext->multi_part_copy_mutex);
+
+    Status status;
+    if (uploadPartCopyOutcome.IsSuccess()) {
+      // success
+      Aws::String eTag =
+          uploadPartCopyOutcome.GetResult().GetCopyPartResult().GetETag();
+      multiPartContext->eTag = eTag;
+      status = Status::OK();
+    } else {
+      LOG(ERROR) << "Error when copying part " << multiPartContext->partNumber
+                 << " " << uploadPartCopyOutcome.GetError().GetMessage();
+      status =
+          errors::Unknown(uploadPartCopyOutcome.GetError().GetExceptionName(),
+                          ": ", uploadPartCopyOutcome.GetError().GetMessage());
+    }
+
+    (*multiPartContext->finishedPartStates)[multiPartContext->partNumber] =
+        multiPartContext->incompletePartStates->at(
+            multiPartContext->partNumber);
+    multiPartContext->finishedPartStates->at(multiPartContext->partNumber)
+        .status = status;
+    multiPartContext->incompletePartStates->erase(
+        multiPartContext->partNumber);
+    // Notify the thread that started the operation
+    multiPartContext->multi_part_copy_cv->notify_one();
+  }
+}
+
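Stripped of the S3 specifics, the synchronization between MultiPartCopyCallback above and the wait loop in MultiPartCopy below is a classic fan-in: N async workers signal one waiting thread. A self-contained sketch of just that pattern (generic names, not from the diff):

// Generic fan-in pattern used by MultiPartCopyCallback/MultiPartCopy:
// workers record completion under the mutex and notify; the issuing thread
// waits until the finished count reaches the expected total.
#include <condition_variable>
#include <mutex>

struct FanIn {
  std::mutex mu;
  std::condition_variable cv;
  int finished = 0;

  void Done() {  // called once by each async worker
    {
      std::lock_guard<std::mutex> lock(mu);
      ++finished;
    }
    cv.notify_one();
  }

  void Wait(int expected) {  // called by the issuing thread
    std::unique_lock<std::mutex> lock(mu);
    // The predicate form guards against spurious and early wakeups.
    cv.wait(lock, [&] { return finished == expected; });
  }
};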
+Status S3FileSystem::CopyFile(const Aws::String& source_bucket,
+                              const Aws::String& source_key,
+                              const Aws::String& target_bucket,
+                              const Aws::String& target_key) {
+  Aws::String source = Aws::String((source_bucket + "/" + source_key).c_str());
+  Aws::String source_full_path = Aws::String("s3://") + source;
+  uint64 file_length;
+  TF_RETURN_IF_ERROR(
+      this->GetFileSize(string(source_full_path.c_str()), &file_length));
+  int num_parts;
+  if (file_length <= multi_part_copy_part_size_) {
+    num_parts = 1;
+  } else {
+    num_parts = ceil((float)file_length / multi_part_copy_part_size_);
+  }
+
+  if (num_parts == 1) {
+    return SimpleCopy(source, target_bucket, target_key);
+  } else if (num_parts > 10000) {
+    string message = strings::StrCat(
+        "MultiPartCopy with number of parts more than 10000 is not supported. "
+        "Your object ",
+        source, " required ", num_parts,
+        " as multi_part_copy_part_size is set to ", multi_part_copy_part_size_,
+        ". You can control this part size using the environment variable ",
+        "S3_MULTI_PART_COPY_PART_SIZE to increase it.");
+    return tensorflow::errors::Unimplemented(message);
+  } else {
+    return MultiPartCopy(source, target_bucket, target_key, num_parts,
+                         file_length);
+  }
+}
+
+Status S3FileSystem::SimpleCopy(const Aws::String& source,
+                                const Aws::String& target_bucket,
+                                const Aws::String& target_key) {
+  VLOG(1) << "SimpleCopy from " << source << " to: " << target_bucket << "/"
+          << target_key;
+  Aws::S3::Model::CopyObjectRequest copyObjectRequest;
+  copyObjectRequest.SetBucket(target_bucket.c_str());
+  copyObjectRequest.SetKey(target_key);
+  copyObjectRequest.SetCopySource(source);
+  auto copyObjectOutcome = this->GetS3Client()->CopyObject(copyObjectRequest);
+  if (!copyObjectOutcome.IsSuccess()) {
+    return CreateStatusFromAwsError(copyObjectOutcome.GetError());
+  }
+  return Status::OK();
+}
+
+Status S3FileSystem::MultiPartCopy(const Aws::String& source,
+                                   const Aws::String& target_bucket,
+                                   const Aws::String& target_key,
+                                   const int num_parts,
+                                   const uint64 file_length) {
+  VLOG(1) << "MultiPartCopy from " << source << " to: " << target_bucket << "/"
+          << target_key;
+  Aws::S3::Model::CreateMultipartUploadRequest multipartUploadRequest;
+  multipartUploadRequest.SetBucket(target_bucket);
+  multipartUploadRequest.SetKey(target_key);
+
+  auto multipartUploadOutcome =
+      this->GetS3Client()->CreateMultipartUpload(multipartUploadRequest);
+  if (!multipartUploadOutcome.IsSuccess()) {
+    return CreateStatusFromAwsError(multipartUploadOutcome.GetError());
+  }
+
+  Aws::String uploadID = multipartUploadOutcome.GetResult().GetUploadId();
+  VLOG(1) << "Copying from " << source << " in " << num_parts
+          << " parts of size " << multi_part_copy_part_size_ << " each";
+  Aws::S3::Model::CompletedMultipartUpload completedMPURequest;
+
+  // passed to each callback keyed by partNumber
+  std::map<int, std::shared_ptr<MultiPartCopyAsyncContext>> partContexts;
+  // keeps track of incompleteParts keyed by partNumber
+  std::map<int, PartState> incompletePartStates;
+  // S3 API partNumber starts from 1
+  for (int partNumber = 1; partNumber <= num_parts; partNumber++) {
+    PartState ps;
+    ps.partNumber = partNumber;
+    incompletePartStates[partNumber] = ps;
+  }
+
+  // keeps track of completed parts keyed by partNumber
+  std::map<int, PartState> finishedPartStates;
+  // mutex which protects access of the partStates map
+  std::mutex multi_part_copy_mutex;
+  // condition variable to be used with above mutex for synchronization
+  std::condition_variable multi_part_copy_cv;
+
+  int retry_count_ = 3;
+  while (retry_count_-- > 0) {
+    // queue up parts
+    for (std::map<int, PartState>::iterator it = incompletePartStates.begin();
+         it != incompletePartStates.end(); it++) {
+      int partNumber = it->first;
+      uint64 startPos = (partNumber - 1) * multi_part_copy_part_size_;
+      uint64 endPos = startPos + multi_part_copy_part_size_ - 1;
+      if (endPos >= file_length) {
+        endPos = file_length - 1;
+      }
+
+      string range = strings::StrCat("bytes=", startPos, "-", endPos);
+
+      Aws::S3::Model::UploadPartCopyRequest uploadPartCopyRequest;
+      uploadPartCopyRequest.SetBucket(target_bucket);
+      uploadPartCopyRequest.SetKey(target_key);
+      uploadPartCopyRequest.SetCopySource(source.c_str());
+      uploadPartCopyRequest.SetCopySourceRange(range.c_str());
+      uploadPartCopyRequest.SetPartNumber(partNumber);
+      uploadPartCopyRequest.SetUploadId(uploadID);
+
+      auto multiPartContext =
+          Aws::MakeShared<MultiPartCopyAsyncContext>("MultiPartCopyContext");
+      multiPartContext->partNumber = partNumber;
+      multiPartContext->incompletePartStates = &incompletePartStates;
+      multiPartContext->finishedPartStates = &finishedPartStates;
+      multiPartContext->multi_part_copy_mutex = &multi_part_copy_mutex;
+      multiPartContext->multi_part_copy_cv = &multi_part_copy_cv;
+
+      // replace with current context
+      partContexts[partNumber] = multiPartContext;
+
+      auto callback =
+          [this](const Aws::S3::S3Client* client,
+                 const Aws::S3::Model::UploadPartCopyRequest& request,
+                 const Aws::S3::Model::UploadPartCopyOutcome& outcome,
+                 const std::shared_ptr<const Aws::Client::AsyncCallerContext>&
+                     context) {
+            this->MultiPartCopyCallback(request, outcome, context);
+          };
+
+      this->GetS3Client()->UploadPartCopyAsync(uploadPartCopyRequest, callback,
+                                               multiPartContext);
+    }
+    // wait till they finish
+    {
+      std::unique_lock<std::mutex> lock(multi_part_copy_mutex);
+      // wait on the mutex until notify is called
+      // then check the finished parts as there could be false notifications
+      multi_part_copy_cv.wait(lock, [&finishedPartStates, num_parts] {
+        return finishedPartStates.size() == num_parts;
+      });
+    }
+    // check if there was any error for any part
+    for (int partNumber = 1; partNumber <= num_parts; partNumber++) {
+      if (finishedPartStates[partNumber].status != Status::OK()) {
+        if (retry_count_ <= 0) {
+          if (finishedPartStates[partNumber].status != Status::OK()) {
+            TF_RETURN_IF_ERROR(
+                AbortMultiPartCopy(target_bucket, target_key, uploadID));
+            return finishedPartStates[partNumber].status;
+          }
+        } else {
+          // retry part
+          LOG(ERROR) << "Retrying failed copy of part " << partNumber
+                     << " due to an error with S3. ";
+          PartState ps;
+          ps.partNumber = partNumber;
+          incompletePartStates[partNumber] = ps;
+          finishedPartStates.erase(partNumber);
+        }
+      }
+    }
+  }
+
+  // if there was an error still in any part, it would abort and return in the
+  // above loop set the eTag of completed Part to the final CompletedMPURequest
+  // note these parts have to be added in order
+  for (int partNumber = 1; partNumber <= num_parts; partNumber++) {
+    Aws::S3::Model::CompletedPart completedPart;
+    completedPart.SetPartNumber(partNumber);
+    completedPart.SetETag(partContexts[partNumber]->eTag);
+    completedMPURequest.AddParts(completedPart);
+  }
+
+  Status finalStatus = CompleteMultiPartCopy(target_bucket, target_key,
+                                             uploadID, completedMPURequest);
+  if (finalStatus != Status::OK()) {
+    TF_RETURN_IF_ERROR(AbortMultiPartCopy(target_bucket, target_key, uploadID));
+  }
+  return finalStatus;
+}
+
+Status S3FileSystem::AbortMultiPartCopy(Aws::String target_bucket,
+                                        Aws::String target_key,
+                                        Aws::String uploadID) {
+  Aws::S3::Model::AbortMultipartUploadRequest abortRequest;
+  abortRequest.WithBucket(target_bucket)
+      .WithKey(target_key)
+      .WithUploadId(uploadID);
+  auto abortOutcome = this->GetS3Client()->AbortMultipartUpload(abortRequest);
+  if (!abortOutcome.IsSuccess()) {
+    return CreateStatusFromAwsError(abortOutcome.GetError());
+  }
+  return Status::OK();
+}
+
+Status S3FileSystem::CompleteMultiPartCopy(
+    Aws::String target_bucket, Aws::String target_key, Aws::String uploadID,
+    Aws::S3::Model::CompletedMultipartUpload completedMPURequest) {
+  Aws::S3::Model::CompleteMultipartUploadRequest completeRequest;
+  completeRequest.SetBucket(target_bucket);
+  completeRequest.SetKey(target_key);
+  completeRequest.SetUploadId(uploadID);
+  completeRequest.SetMultipartUpload(completedMPURequest);
+  auto completeOutcome =
+      this->GetS3Client()->CompleteMultipartUpload(completeRequest);
+  if (!completeOutcome.IsSuccess()) {
+    return CreateStatusFromAwsError(completeOutcome.GetError());
+  }
+  return Status::OK();
+}
+
 Status S3FileSystem::RenameFile(const string& src, const string& target) {
   string src_bucket, src_object, target_bucket, target_object;
   TF_RETURN_IF_ERROR(ParseS3Path(src, false, &src_bucket, &src_object));
@@ -621,8 +992,7 @@ Status S3FileSystem::RenameFile(const string& src, const string& target) {
     auto listObjectsOutcome =
         this->GetS3Client()->ListObjects(listObjectsRequest);
     if (!listObjectsOutcome.IsSuccess()) {
-      return errors::Unknown(listObjectsOutcome.GetError().GetExceptionName(),
-                             ": ", listObjectsOutcome.GetError().GetMessage());
+      return CreateStatusFromAwsError(listObjectsOutcome.GetError());
     }
     listObjectsResult = listObjectsOutcome.GetResult();
@@ -630,19 +1000,10 @@ Status S3FileSystem::RenameFile(const string& src, const string& target) {
       Aws::String src_key = object.GetKey();
       Aws::String target_key = src_key;
       target_key.replace(0, src_object.length(), target_object.c_str());
-      Aws::String source =
-          Aws::String(src_bucket.c_str()) + "/" + src_key.c_str();
 
-      copyObjectRequest.SetBucket(target_bucket.c_str());
-      copyObjectRequest.SetKey(target_key);
-      copyObjectRequest.SetCopySource(source);
-
-      auto copyObjectOutcome =
-          this->GetS3Client()->CopyObject(copyObjectRequest);
-      if (!copyObjectOutcome.IsSuccess()) {
-        return errors::Unknown(copyObjectOutcome.GetError().GetExceptionName(),
-                               ": ", copyObjectOutcome.GetError().GetMessage());
-      }
+      TF_RETURN_IF_ERROR(CopyFile(Aws::String(src_bucket.c_str()), src_key,
+                                  Aws::String(target_bucket.c_str()),
+                                  target_key));
 
       deleteObjectRequest.SetBucket(src_bucket.c_str());
       deleteObjectRequest.SetKey(src_key.c_str());
@@ -650,9 +1011,7 @@ Status S3FileSystem::RenameFile(const string& src, const string& target) {
       auto deleteObjectOutcome =
           this->GetS3Client()->DeleteObject(deleteObjectRequest);
       if (!deleteObjectOutcome.IsSuccess()) {
-        return errors::Unknown(
-            deleteObjectOutcome.GetError().GetExceptionName(), ": ",
-            deleteObjectOutcome.GetError().GetMessage());
+        return CreateStatusFromAwsError(deleteObjectOutcome.GetError());
       }
     }
     listObjectsRequest.SetMarker(listObjectsResult.GetNextMarker());
@@ -666,6 +1025,6 @@ Status S3FileSystem::HasAtomicMove(const string& path, bool* has_atomic_move) {
   return Status::OK();
 }
 
-REGISTER_FILE_SYSTEM("s3", S3FileSystem);
+REGISTER_FILE_SYSTEM("s3", RetryingS3FileSystem);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/s3/s3_file_system.h b/tensorflow/core/platform/s3/s3_file_system.h
index 8b6fa81f164..9686ca4c568 100644
--- a/tensorflow/core/platform/s3/s3_file_system.h
+++ b/tensorflow/core/platform/s3/s3_file_system.h
@@ -16,12 +16,34 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_S3_S3_FILE_SYSTEM_H_
 #define TENSORFLOW_CONTRIB_S3_S3_FILE_SYSTEM_H_
 
+#include <aws/core/utils/threading/Executor.h>
+#include <aws/transfer/TransferManager.h>
 #include <aws/s3/S3Client.h>
+#include <condition_variable>
+#include <mutex>
+
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/retrying_file_system.h"
 
 namespace tensorflow {
 
+struct PartState {
+  int partNumber;
+  Status status;
+};
+
+struct MultiPartCopyAsyncContext : public Aws::Client::AsyncCallerContext {
+  int partNumber;
+  std::map<int, PartState>* incompletePartStates;
+  std::map<int, PartState>* finishedPartStates;
+  Aws::String eTag;
+
+  // lock and cv for multi part copy
+  std::mutex* multi_part_copy_mutex;
+  std::condition_variable* multi_part_copy_cv;
+};
+
 class S3FileSystem : public FileSystem {
  public:
   S3FileSystem();
@@ -76,10 +98,53 @@ class S3FileSystem : public FileSystem {
   // This S3 Client does not support Virtual Hosted–Style Method
   // for a bucket.
   std::shared_ptr<Aws::S3::S3Client> GetS3Client();
   std::shared_ptr<Aws::S3::S3Client> s3_client_;
 
-  // Lock held when checking for s3_client_ initialization.
-  mutex client_lock_;
+  // Returns the member transfer manager, initializing as-needed.
+  std::shared_ptr<Aws::Transfer::TransferManager> GetTransferManager();
+  std::shared_ptr<Aws::Transfer::TransferManager> transfer_manager_;
+
+  // Returns the member executor for transfer manager, initializing as-needed.
+  std::shared_ptr<Aws::Utils::Threading::PooledThreadExecutor> GetExecutor();
+  std::shared_ptr<Aws::Utils::Threading::PooledThreadExecutor> executor_;
+
+  Status CopyFile(const Aws::String& source_bucket,
+                  const Aws::String& source_key,
+                  const Aws::String& target_bucket,
+                  const Aws::String& target_key);
+  Status SimpleCopy(const Aws::String& source, const Aws::String& target_bucket,
+                    const Aws::String& target_key);
+  Status MultiPartCopy(const Aws::String& source,
+                       const Aws::String& target_bucket,
+                       const Aws::String& target_key, const int num_parts,
+                       const uint64 file_length);
+  Status AbortMultiPartCopy(Aws::String target_bucket, Aws::String target_key,
+                            Aws::String uploadID);
+  Status CompleteMultiPartCopy(
+      Aws::String target_bucket, Aws::String target_key, Aws::String uploadId,
+      Aws::S3::Model::CompletedMultipartUpload completedMPURequest);
+  void MultiPartCopyCallback(
+      const Aws::S3::Model::UploadPartCopyRequest& request,
+      const Aws::S3::Model::UploadPartCopyOutcome& uploadPartCopyOutcome,
+      const std::shared_ptr<const Aws::Client::AsyncCallerContext>&
+          multiPartContext);
+
+  // Lock held when checking for s3_client_ and transfer_manager_
+  // initialization.
+  mutex initialization_lock_;
+
+  // size to split objects during multipart copy
+  uint64 multi_part_copy_part_size_;
+};
+
+/// S3 implementation of a file system with retry on failures.
+class RetryingS3FileSystem : public RetryingFileSystem<S3FileSystem> {
+ public:
+  RetryingS3FileSystem()
+      : RetryingFileSystem(
+            std::unique_ptr<S3FileSystem>(new S3FileSystem),
+            RetryConfig(100000 /* init_delay_time_us */,
+                        32000000 /* max_delay_time_us */,
+                        10 /* max_retries */)) {}
+};
 
 }  // namespace tensorflow
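RetryingS3FileSystem above reuses the retry machinery the GCS filesystem already relied on, which is why retrying_utils/retrying_file_system moved up to platform/ in this change. A hedged sketch of driving that machinery directly (it assumes RetryingUtils::CallWithRetries keeps the signature it had under cloud/; the helper name is mine):

// Sketch only: retrying an idempotent filesystem call with the relocated
// retrying_utils, using the same knobs RetryingS3FileSystem configures:
// 0.1s initial delay, 32s cap, at most 10 attempts, exponential backoff.
#include "tensorflow/core/platform/file_system.h"
#include "tensorflow/core/platform/retrying_utils.h"

tensorflow::Status StatWithRetries(tensorflow::FileSystem* fs,
                                   const tensorflow::string& fname,
                                   tensorflow::FileStatistics* stat) {
  tensorflow::RetryConfig config(100000 /* init_delay_time_us */,
                                 32000000 /* max_delay_time_us */,
                                 10 /* max_retries */);
  return tensorflow::RetryingUtils::CallWithRetries(
      [&]() { return fs->Stat(fname, stat); }, config);
}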
#include "tensorflow/core/platform/types.h" +#if !defined(PLATFORM_WINDOWS) +#include +#else +namespace tensorflow { +struct iovec { + void* iov_base; + size_t iov_len; +}; +} // namespace tensorflow +#endif + namespace tensorflow { namespace port { @@ -28,6 +39,9 @@ bool Snappy_GetUncompressedLength(const char* input, size_t length, size_t* result); bool Snappy_Uncompress(const char* input, size_t length, char* output); +bool Snappy_UncompressToIOVec(const char* compressed, size_t compressed_length, + const struct iovec* iov, size_t iov_cnt); + } // namespace port } // namespace tensorflow diff --git a/tensorflow/core/platform/windows/port.cc b/tensorflow/core/platform/windows/port.cc index 2303b587ce6..547af76bdf6 100644 --- a/tensorflow/core/platform/windows/port.cc +++ b/tensorflow/core/platform/windows/port.cc @@ -157,6 +157,17 @@ bool Snappy_Uncompress(const char* input, size_t length, char* output) { #endif } +bool Snappy_UncompressToIOVec(const char* compressed, size_t compressed_length, + const struct iovec* iov, size_t iov_cnt) { +#ifdef TF_USE_SNAPPY + const snappy::iovec* snappy_iov = reinterpret_cast(iov); + return snappy::RawUncompressToIOVec(compressed, compressed_length, snappy_iov, + iov_cnt); +#else + return false; +#endif +} + string Demangle(const char* mangled) { return mangled; } double NominalCPUFrequency() { diff --git a/tensorflow/core/profiler/convert/BUILD b/tensorflow/core/profiler/convert/BUILD index 1697e02930c..87810a29dab 100644 --- a/tensorflow/core/profiler/convert/BUILD +++ b/tensorflow/core/profiler/convert/BUILD @@ -157,14 +157,17 @@ cc_library( hdrs = ["xplane_to_op_stats.h"], deps = [ ":step_events_to_steps_db", + ":xplane_to_kernel_stats_db", ":xplane_to_op_metrics_db", ":xplane_to_step_events", "//tensorflow/core:lib", "//tensorflow/core/profiler/protobuf:hardware_types_proto_cc", + "//tensorflow/core/profiler/protobuf:kernel_stats_proto_cc", "//tensorflow/core/profiler/protobuf:op_stats_proto_cc", "//tensorflow/core/profiler/protobuf:xplane_proto_cc", "//tensorflow/core/profiler/utils:event_span", "//tensorflow/core/profiler/utils:hardware_type_utils", + "//tensorflow/core/profiler/utils:kernel_stats_utils", "//tensorflow/core/profiler/utils:tf_xplane_visitor", "//tensorflow/core/profiler/utils:xplane_schema", "//tensorflow/core/profiler/utils:xplane_utils", @@ -206,6 +209,7 @@ cc_library( "//tensorflow/core/profiler:profiler_service_proto_cc", "//tensorflow/core/profiler/protobuf:hardware_types_proto_cc", "//tensorflow/core/profiler/protobuf:input_pipeline_proto_cc", + "//tensorflow/core/profiler/protobuf:kernel_stats_proto_cc", "//tensorflow/core/profiler/protobuf:op_stats_proto_cc", "//tensorflow/core/profiler/protobuf:overview_page_proto_cc", "//tensorflow/core/profiler/protobuf:tf_stats_proto_cc", @@ -296,3 +300,23 @@ tf_cc_test( "//tensorflow/core/profiler/utils:xplane_utils", ], ) + +cc_library( + name = "xplane_to_kernel_stats_db", + srcs = ["xplane_to_kernel_stats_db.cc"], + hdrs = ["xplane_to_kernel_stats_db.h"], + deps = [ + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core/profiler/protobuf:kernel_stats_proto_cc", + "//tensorflow/core/profiler/protobuf:xplane_proto_cc", + "//tensorflow/core/profiler/utils:event_span", + "//tensorflow/core/profiler/utils:kernel_stats_utils", + "//tensorflow/core/profiler/utils:tf_op_utils", + "//tensorflow/core/profiler/utils:tf_xplane_visitor", + "//tensorflow/core/profiler/utils:trace_utils", + "//tensorflow/core/profiler/utils:xplane_schema", + 
"//tensorflow/core/profiler/utils:xplane_utils", + "//tensorflow/core/profiler/utils:xplane_visitor", + ], +) diff --git a/tensorflow/core/profiler/convert/step_events_to_steps_db.cc b/tensorflow/core/profiler/convert/step_events_to_steps_db.cc index 476f1618412..d518c715a89 100644 --- a/tensorflow/core/profiler/convert/step_events_to_steps_db.cc +++ b/tensorflow/core/profiler/convert/step_events_to_steps_db.cc @@ -32,8 +32,10 @@ StepInfoResult ConvertStepDetailsToStepInfo(bool has_device, int64 step_num, auto& type_ps = *(generic.mutable_type_ps()); uint64 total_event_duration = 0; for (const auto& event : step_details.Events()) { - type_ps[event.type] += event.span.duration_ps(); - total_event_duration += event.span.duration_ps(); + // Ignore event duration outside the step marker. + uint64 event_duration = step_time.OverlappedDurationPs(event.span); + type_ps[event.type] += event_duration; + total_event_duration += event_duration; } if (total_event_duration < step_time.duration_ps()) { // Some time in the step is not associated with any event. Classify them as diff --git a/tensorflow/core/profiler/convert/xplane_to_kernel_stats_db.cc b/tensorflow/core/profiler/convert/xplane_to_kernel_stats_db.cc new file mode 100644 index 00000000000..4b87033a508 --- /dev/null +++ b/tensorflow/core/profiler/convert/xplane_to_kernel_stats_db.cc @@ -0,0 +1,89 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/tensorflow/core/profiler/convert/xplane_to_kernel_stats_db.cc b/tensorflow/core/profiler/convert/xplane_to_kernel_stats_db.cc
new file mode 100644
index 00000000000..4b87033a508
--- /dev/null
+++ b/tensorflow/core/profiler/convert/xplane_to_kernel_stats_db.cc
@@ -0,0 +1,89 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/profiler/convert/xplane_to_kernel_stats_db.h"
+
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"
+#include "tensorflow/core/profiler/utils/event_span.h"
+#include "tensorflow/core/profiler/utils/kernel_stats_utils.h"
+#include "tensorflow/core/profiler/utils/tf_op_utils.h"
+#include "tensorflow/core/profiler/utils/tf_xplane_visitor.h"
+#include "tensorflow/core/profiler/utils/trace_utils.h"
+#include "tensorflow/core/profiler/utils/xplane_schema.h"
+#include "tensorflow/core/profiler/utils/xplane_utils.h"
+
+namespace tensorflow {
+namespace profiler {
+
+KernelStatsDb ConvertDeviceTraceXPlaneToKernelStatsDb(
+    const XPlane& device_trace,
+    const std::function<void(const XEventVisitor&, KernelReport*)>&
+        on_kernel_fn) {
+  KernelStatsDb result;
+  XPlaneVisitor plane = CreateTfXPlaneVisitor(&device_trace);
+  plane.ForEachLine([&](const XLineVisitor& line) {
+    if (IsDerivedThreadId(line.Id())) {
+      return;
+    }
+    line.ForEachEvent([&](const XEventVisitor& event) {
+      absl::string_view tf_op_fullname;
+      KernelReport kernel;
+
+      event.ForEachStat([&](const tensorflow::profiler::XStatVisitor& stat) {
+        if (stat.Type() == StatType::kLevel0) {
+          tf_op_fullname = stat.StrValue();
+        } else if (stat.Type() == StatType::kKernelDetails) {
+          kernel.set_name(event.Name().data(), event.Name().size());
+          bool using_tensor_cores = IsKernelUsingTensorCore(event.Name());
+          kernel.set_is_kernel_using_tensor_core(using_tensor_cores);
+          kernel.set_total_duration_ns(event.DurationNs());
+          kernel.set_min_duration_ns(event.DurationNs());
+          kernel.set_max_duration_ns(event.DurationNs());
+          ParseKernelLaunchParams(stat.StrValue(), &kernel);
+        }
+      });
+
+      if (!tf_op_fullname.empty()) {
+        tensorflow::profiler::TfOp tf_op = ParseTfOpFullname(tf_op_fullname);
+        if (kernel.total_duration_ns()) {
+          kernel.set_op_name(tf_op.name.data(), tf_op.name.size());
+          bool tensor_core_eligible = IsOpTensorCoreEligible(kernel.op_name());
+#if defined(LOG_IF)
+          LOG_IF(INFO,
+                 !tensor_core_eligible && kernel.is_kernel_using_tensor_core())
+              << "Detected new Op using TensorCores: " << kernel.op_name()
+              << std::endl;
+#endif  // defined(LOG_IF)
+          tensor_core_eligible |= kernel.is_kernel_using_tensor_core();
+          kernel.set_is_op_tensor_core_eligible(tensor_core_eligible);
+        }
+      }
+
+      if (on_kernel_fn) {
+        on_kernel_fn(event, &kernel);
+      }
+
+      if (kernel.total_duration_ns()) {
+        *result.add_reports() = kernel;
+      }
+    });
+  });
+
+  return result;
+}
+
+}  // namespace profiler
+}  // namespace tensorflow
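A hedged usage sketch for the converter defined above, showing the optional per-kernel hook (the wrapper name and the logging body are illustrative, not from the diff):

// Sketch only: invoking the new converter with a per-kernel hook. The hook
// runs once per device event, after the report is populated and before it
// is added to the database; passing {} disables it, as xplane_to_op_stats
// does below.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/profiler/convert/xplane_to_kernel_stats_db.h"

tensorflow::profiler::KernelStatsDb CollectKernelStats(
    const tensorflow::profiler::XPlane& device_trace) {
  return tensorflow::profiler::ConvertDeviceTraceXPlaneToKernelStatsDb(
      device_trace,
      /*on_kernel_fn=*/[](const tensorflow::profiler::XEventVisitor& event,
                          tensorflow::profiler::KernelReport* kernel) {
        VLOG(2) << "kernel: " << kernel->name();
      });
}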
diff --git a/tensorflow/core/profiler/convert/xplane_to_kernel_stats_db.h b/tensorflow/core/profiler/convert/xplane_to_kernel_stats_db.h
new file mode 100644
index 00000000000..04bd0e8ae5f
--- /dev/null
+++ b/tensorflow/core/profiler/convert/xplane_to_kernel_stats_db.h
@@ -0,0 +1,37 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_KERNEL_STATS_DB_H_
+#define TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_KERNEL_STATS_DB_H_
+
+#include <functional>
+#include <string>
+
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"
+#include "tensorflow/core/profiler/protobuf/xplane.pb.h"
+#include "tensorflow/core/profiler/utils/xplane_visitor.h"
+
+namespace tensorflow {
+namespace profiler {
+
+KernelStatsDb ConvertDeviceTraceXPlaneToKernelStatsDb(
+    const XPlane& device_trace,
+    const std::function<void(const XEventVisitor&, KernelReport*)>&
+        on_kernel_fn);
+
+}  // namespace profiler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_KERNEL_STATS_DB_H_
diff --git a/tensorflow/core/profiler/convert/xplane_to_op_stats.cc b/tensorflow/core/profiler/convert/xplane_to_op_stats.cc
index 1025c581b5a..f1182e095e2 100644
--- a/tensorflow/core/profiler/convert/xplane_to_op_stats.cc
+++ b/tensorflow/core/profiler/convert/xplane_to_op_stats.cc
@@ -17,11 +17,14 @@ limitations under the License.
 
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/profiler/convert/step_events_to_steps_db.h"
+#include "tensorflow/core/profiler/convert/xplane_to_kernel_stats_db.h"
 #include "tensorflow/core/profiler/convert/xplane_to_op_metrics_db.h"
 #include "tensorflow/core/profiler/convert/xplane_to_step_events.h"
 #include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"
+#include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"
 #include "tensorflow/core/profiler/utils/event_span.h"
 #include "tensorflow/core/profiler/utils/hardware_type_utils.h"
+#include "tensorflow/core/profiler/utils/kernel_stats_utils.h"
 #include "tensorflow/core/profiler/utils/tf_xplane_visitor.h"
 #include "tensorflow/core/profiler/utils/xplane_schema.h"
 #include "tensorflow/core/profiler/utils/xplane_utils.h"
@@ -86,6 +89,8 @@ OpStats ConvertXSpaceToOpStats(const XSpace& space) {
   OpMetricsDbCombiner op_metrics_db_combiner(
       op_stats.mutable_device_op_metrics_db());
   SetRunEnvironment(device_planes.size(), op_stats.mutable_run_environment());
+
+  std::vector<KernelReport> reports;
   for (const XPlane* device_trace : device_planes) {
     if (!op_stats.has_perf_env()) {
       *op_stats.mutable_perf_env() = GetPerfEnvFromXPlane(*device_trace);
@@ -97,7 +102,13 @@ OpStats ConvertXSpaceToOpStats(const XSpace& space) {
     op_metrics_db_combiner.Combine(device_op_metrics_db);
     CombineStepEvents(ConvertDeviceTraceXPlaneToStepEvents(*device_trace),
                       &step_events);
+    KernelStatsDb kernel_stats_db = ConvertDeviceTraceXPlaneToKernelStatsDb(
+        *device_trace, /*on_kernel_fn=*/{});
+    reports.insert(reports.begin(), kernel_stats_db.reports().begin(),
+                   kernel_stats_db.reports().end());
   }
+  GroupKernelReports(&reports, op_stats.mutable_kernel_stats_db());
+  SortKernelsByTotalDurationDesc(op_stats.mutable_kernel_stats_db());
   // Convert a host plane.
   bool has_device = !device_planes.empty();
   if (host_plane) {
diff --git a/tensorflow/core/profiler/convert/xplane_to_profile_response.cc b/tensorflow/core/profiler/convert/xplane_to_profile_response.cc
index f3b62e13243..ccc08657842 100644
--- a/tensorflow/core/profiler/convert/xplane_to_profile_response.cc
+++ b/tensorflow/core/profiler/convert/xplane_to_profile_response.cc
@@ -25,6 +25,7 @@ limitations under the License.
#include "tensorflow/core/profiler/profiler_service.pb.h" #include "tensorflow/core/profiler/protobuf/hardware_types.pb.h" #include "tensorflow/core/profiler/protobuf/input_pipeline.pb.h" +#include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h" #include "tensorflow/core/profiler/protobuf/op_stats.pb.h" #include "tensorflow/core/profiler/protobuf/overview_page.pb.h" #include "tensorflow/core/profiler/protobuf/tf_stats.pb.h" @@ -37,6 +38,7 @@ namespace { const absl::string_view kTensorflowStats = "tensorflow_stats"; const absl::string_view kInputPipeline = "input_pipeline"; const absl::string_view kOverviewPage = "overview_page"; +const absl::string_view kKernelStats = "kernel_stats"; HardwareType HardwareTypeFromRunEnvironment(const RunEnvironment& run_env) { if (run_env.device_type() == "GPU") return HardwareType::GPU; @@ -85,6 +87,9 @@ void ConvertXSpaceToProfileResponse(const XSpace& xspace, TfStatsDatabase tf_stats_db = ConvertOpStatsToTfStats(op_stats); AddToolData(ToolName(kTensorflowStats), tf_stats_db, response); } + if (tools.contains(kKernelStats)) { + AddToolData(ToolName(kKernelStats), op_stats.kernel_stats_db(), response); + } } } // namespace profiler diff --git a/tensorflow/core/profiler/convert/xplane_to_step_events.cc b/tensorflow/core/profiler/convert/xplane_to_step_events.cc index 4f99225ef29..56684e2e3c9 100644 --- a/tensorflow/core/profiler/convert/xplane_to_step_events.cc +++ b/tensorflow/core/profiler/convert/xplane_to_step_events.cc @@ -24,8 +24,7 @@ namespace tensorflow { namespace profiler { namespace { -// Returns true if the given event_name is a step marker. -inline bool IsStepMarker(absl::string_view event_name) { +inline bool IsExplicitHostStepMarker(absl::string_view event_name) { return (str_util::StartsWith(event_name, "train") || str_util::StartsWith(event_name, "test") || str_util::StartsWith(event_name, "TraceContext")) && @@ -39,7 +38,7 @@ inline bool IsRealCpuCompute(absl::string_view event_name) { str_util::StartsWith(event_name, "EagerLocalExecute") || str_util::StartsWith(event_name, "EagerKernelExecute") || str_util::StartsWith(event_name, "FunctionRun") || - IsStepMarker(event_name); + IsExplicitHostStepMarker(event_name); return !not_real; } @@ -71,11 +70,13 @@ StepEvents ConvertHostThreadsXLineToStepEvents( device_step_events.find(group_id) == device_step_events.end()) return; Timespan timespan = Timespan(event.TimestampPs(), event.DurationPs()); - // If an explicit step marker is not available, look for an implicit one - // which has a step_name stat. - if (IsStepMarker(event.Name()) || !step_name.empty()) { - result[group_id].AddMarker( - StepMarker(/*device=*/false, event.Name(), timespan)); + if (IsExplicitHostStepMarker(event.Name())) { + result[group_id].AddMarker(StepMarker( + StepMarkerType::kExplicitHostStepMarker, event.Name(), timespan)); + } else if (!step_name.empty()) { + // Grouping adds a step_name stat to implicit host step markers. 
+ result[group_id].AddMarker(StepMarker( + StepMarkerType::kImplicitHostStepMarker, event.Name(), timespan)); } else if (IsRealCpuCompute(event.Name())) { EventTypeSpan event_type_span( ClassifyCpuEvent(event.Name(), correlation_id), timespan); @@ -98,6 +99,21 @@ StepEvents ConvertHostThreadsXPlaneToStepEvents( return result; } +StepEvents ConvertDeviceStepInfoToStepMarkers(const XLineVisitor& line) { + StepEvents result; + line.ForEachEvent([&](const XEventVisitor& event) { + event.ForEachStat([&](const XStatVisitor& stat) { + if (stat.Type() == StatType::kGroupId) { + result[stat.IntValue()].AddMarker( + StepMarker(StepMarkerType::kDeviceStepMarker, event.Name(), + Timespan(event.TimestampPs(), event.DurationPs()))); + return; + } + }); + }); + return result; +} + StepEvents ConvertDeviceTraceXLineToStepEvents(const XLineVisitor& line) { StepEvents result; line.ForEachEvent([&](const XEventVisitor& event) { @@ -128,8 +144,14 @@ StepEvents ConvertDeviceTraceXPlaneToStepEvents(const XPlane& device_trace) { StepEvents result; XPlaneVisitor plane = CreateTfXPlaneVisitor(&device_trace); plane.ForEachLine([&](const XLineVisitor& line) { - if (IsDerivedThreadId(line.Id())) return; - CombineStepEvents(ConvertDeviceTraceXLineToStepEvents(line), &result); + int64 line_id = line.Id(); + if (line_id == kThreadIdStepInfo) { + CombineStepEvents(ConvertDeviceStepInfoToStepMarkers(line), &result); + } else if (IsDerivedThreadId(line_id)) { + return; + } else { + CombineStepEvents(ConvertDeviceTraceXLineToStepEvents(line), &result); + } }); return result; } diff --git a/tensorflow/core/profiler/internal/gpu/device_tracer.cc b/tensorflow/core/profiler/internal/gpu/device_tracer.cc index 50a901f3670..9494c6fe233 100644 --- a/tensorflow/core/profiler/internal/gpu/device_tracer.cc +++ b/tensorflow/core/profiler/internal/gpu/device_tracer.cc @@ -56,7 +56,14 @@ bool IsHostEvent(const CuptiTracerEvent& event) { } void CreateXEvent(const CuptiTracerEvent& event, XPlaneBuilder* plane, - XLineBuilder* line) { + uint64 start_gpu_ns, uint64 end_gpu_ns, XLineBuilder* line) { + if (event.start_time_ns < start_gpu_ns || event.end_time_ns > end_gpu_ns || + event.start_time_ns > event.end_time_ns) { + VLOG(2) << "Event has abnormal timestamps: " << event.name + << " start time(ns): " << event.start_time_ns + << " end time(ns): " << event.end_time_ns; + return; + } std::string kernel_name = port::MaybeAbiDemangle(event.name.c_str()); XEventMetadata* event_metadata = plane->GetOrCreateEventMetadata(kernel_name); XEventBuilder xevent = line->AddEvent(*event_metadata); @@ -203,14 +210,15 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector { LOG(INFO) << " GpuTracer has collected " << num_callback_events_ << " callback api events and " << num_activity_events_ << " activity events."; + uint64 end_gpu_ns = CuptiTracer::GetTimestamp(); XPlaneBuilder host_plane(GetOrCreatePlane(space, kCuptiDriverApiPlaneName)); host_plane.SetId(kCuptiDriverApiPlaneId); for (int device_ordinal = 0; device_ordinal < num_gpus_; ++device_ordinal) { std::string name = absl::StrCat(kGpuPlanePrefix, device_ordinal); XPlaneBuilder device_plane(GetOrCreatePlane(space, name)); device_plane.SetId(kGpuPlaneBaseId + device_ordinal); - per_device_collector_[device_ordinal].Flush(start_gpu_ns_, &device_plane, - &host_plane); + per_device_collector_[device_ordinal].Flush(start_gpu_ns_, end_gpu_ns, + &device_plane, &host_plane); + per_device_collector_[device_ordinal].GetDeviceCapabilities( device_ordinal, &device_plane);
NormalizeTimeStamps(&device_plane, start_walltime_ns_); @@ -348,8 +356,8 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector { events.clear(); } - void Flush(uint64 start_gpu_ns, XPlaneBuilder* device_plane, - XPlaneBuilder* host_plane) { + void Flush(uint64 start_gpu_ns, uint64 end_gpu_ns, + XPlaneBuilder* device_plane, XPlaneBuilder* host_plane) { absl::MutexLock lock(&mutex); // Tracking event types per line. @@ -365,7 +373,7 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector { auto* plane = is_host_event ? host_plane : device_plane; XLineBuilder line = plane->GetOrCreateLine(line_id); line.SetTimestampNs(start_gpu_ns); - CreateXEvent(event, plane, &line); + CreateXEvent(event, plane, start_gpu_ns, end_gpu_ns, &line); events_types_per_line[line_id].emplace(event.type); } device_plane->ForEachLine([&](tensorflow::profiler::XLineBuilder line) { diff --git a/tensorflow/core/profiler/protobuf/BUILD b/tensorflow/core/profiler/protobuf/BUILD index 80d62faadf8..30289f247f3 100644 --- a/tensorflow/core/profiler/protobuf/BUILD +++ b/tensorflow/core/profiler/protobuf/BUILD @@ -67,6 +67,7 @@ tf_proto_library( srcs = ["op_stats.proto"], cc_api_version = 2, protodeps = [ + ":kernel_stats_proto", ":op_metrics_proto", ":steps_db_proto", ], @@ -75,6 +76,13 @@ tf_proto_library( ], ) +tf_proto_library( + name = "kernel_stats_proto", + srcs = ["kernel_stats.proto"], + cc_api_version = 2, + visibility = [":friends"], +) + # This proto is deprecated and not guaranteed to be compatible across versions. # Please don't use it in new projects unless you have double confirmed. tf_proto_library( diff --git a/tensorflow/core/profiler/protobuf/kernel_stats.proto b/tensorflow/core/profiler/protobuf/kernel_stats.proto new file mode 100644 index 00000000000..144ec9acb8a --- /dev/null +++ b/tensorflow/core/profiler/protobuf/kernel_stats.proto @@ -0,0 +1,37 @@ +syntax = "proto3"; + +package tensorflow.profiler; + +message KernelReport { + // Name of the kernel. + string name = 1; + // Registers per thread. + uint32 registers_per_thread = 2; + // Static shared memory in bytes. + uint32 static_shmem_bytes = 3; + // Dynamic shared memory in bytes. + uint32 dynamic_shmem_bytes = 4; + // Block dimensions. + repeated uint32 block_dim = 5; + // Grid dimensions. + repeated uint32 grid_dim = 6; + // Total duration of this kernel. + uint64 total_duration_ns = 7; + // Min duration of kernel in nanoseconds. + uint64 min_duration_ns = 8; + // Max duration of kernel in nanoseconds. + uint64 max_duration_ns = 9; + // Kernel utilizes TensorCore instructions. + bool is_kernel_using_tensor_core = 10; + // Operation is eligible to use TensorCores. + bool is_op_tensor_core_eligible = 11; + // TF operation name. + string op_name = 12; + // Number of occurrences. + uint32 occurrences = 13; +} + +message KernelStatsDb { + // A list of kernels aggregated by name. + repeated KernelReport reports = 1; +}
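To make the schema above concrete, here is a sketch of a report as code might populate it; every value below is illustrative, not taken from a real profile:

    KernelReport report;
    report.set_name("volta_h884gemm_128x128_ldg8_nn");  // hypothetical kernel
    for (uint32 d : {96u, 1u, 1u}) report.add_grid_dim(d);
    for (uint32 d : {128u, 1u, 1u}) report.add_block_dim(d);
    report.set_total_duration_ns(250000);  // summed over 2 occurrences
    report.set_min_duration_ns(110000);
    report.set_max_duration_ns(140000);
    report.set_is_kernel_using_tensor_core(true);
    report.set_is_op_tensor_core_eligible(true);
    report.set_op_name("model/dense/MatMul");  // hypothetical TF op
    report.set_occurrences(2);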
diff --git a/tensorflow/core/profiler/protobuf/op_stats.proto b/tensorflow/core/profiler/protobuf/op_stats.proto index a3926bea7b5..8a7df7f60c3 100644 --- a/tensorflow/core/profiler/protobuf/op_stats.proto +++ b/tensorflow/core/profiler/protobuf/op_stats.proto @@ -2,6 +2,7 @@ syntax = "proto3"; package tensorflow.profiler; +import "tensorflow/core/profiler/protobuf/kernel_stats.proto"; import "tensorflow/core/profiler/protobuf/op_metrics.proto"; import "tensorflow/core/profiler/protobuf/steps_db.proto"; @@ -99,4 +100,6 @@ message OpStats { StepDatabaseResult step_db = 4; // The run environment of this profiling session. RunEnvironment run_environment = 5; + // Kernel stats results from all GPUs. + KernelStatsDb kernel_stats_db = 6; } diff --git a/tensorflow/core/profiler/rpc/client/capture_profile.cc b/tensorflow/core/profiler/rpc/client/capture_profile.cc index 0930aab8c96..87a71bb9ff2 100644 --- a/tensorflow/core/profiler/rpc/client/capture_profile.cc +++ b/tensorflow/core/profiler/rpc/client/capture_profile.cc @@ -149,8 +149,8 @@ Status NewSession(const string& service_addr, // TODO(jiesun): GRPC supports the following relevant naming schemes: // 1. dns:///host:port // 2. ipv4:host:port or ipv6:[host]:port - // We might need to change the prefix which depends on what TPU name resolver - // will give us. + // We might need to change the prefix, depending on what the cluster name + // resolver will give us. std::unique_ptr<grpc::ProfileAnalysis::Stub> stub = grpc::ProfileAnalysis::NewStub(::grpc::CreateCustomChannel( "dns:///" + service_addr, ::grpc::InsecureChannelCredentials(), @@ -191,7 +191,7 @@ Status ValidateHostPortPair(const string& host_port) { return Status::OK(); } -// Starts tracing on a single or multiple TPU hosts and saves the result in the +// Starts tracing on one or more hosts and saves the result in the // given logdir. If no trace was collected, retries tracing for // num_tracing_attempts. Status Trace(const string& service_addr, const string& logdir, @@ -211,14 +211,14 @@ Status Trace(const string& service_addr, const string& logdir, ProfileOptions opts; opts.set_include_dataset_ops(include_dataset_ops); while (true) { - std::cout << "Starting to profile TPU traces for " << duration_ms << " ms. " + std::cout << "Starting to trace for " << duration_ms << " ms. " << "Remaining attempt(s): " << --remaining_attempts << std::endl; if (hostnames.empty()) { status = Profile(service_addr, logdir, duration_ms, repository_root, session_id, opts); } else { - string tpu_master = service_addr; - status = NewSession(tpu_master, hostnames, duration_ms, repository_root, + string master = service_addr; + status = NewSession(master, hostnames, duration_ms, repository_root, session_id, opts); } if (remaining_attempts <= 0 || status.ok() || !ShouldRetryTracing(status)) diff --git a/tensorflow/core/profiler/rpc/profiler_service_impl.cc b/tensorflow/core/profiler/rpc/profiler_service_impl.cc index d81d1509a84..69588cf3f68 100644 --- a/tensorflow/core/profiler/rpc/profiler_service_impl.cc +++ b/tensorflow/core/profiler/rpc/profiler_service_impl.cc @@ -49,7 +49,7 @@ class ProfilerServiceImpl : public grpc::ProfilerService::Service { ::grpc::Status Profile(::grpc::ServerContext* ctx, const ProfileRequest* req, ProfileResponse* response) override { - LOG(INFO) << "Received a profile request: " << req->DebugString(); + VLOG(1) << "Received a profile request: " << req->DebugString(); std::unique_ptr<ProfilerSession> profiler = ProfilerSession::Create(); Status status = profiler->Status(); if (!status.ok()) { diff --git a/tensorflow/core/profiler/utils/BUILD b/tensorflow/core/profiler/utils/BUILD index 07d5598171e..4dcbf38c1bd 100644 --- a/tensorflow/core/profiler/utils/BUILD +++ b/tensorflow/core/profiler/utils/BUILD @@ -304,3 +304,14 @@ tf_cc_test( "@com_google_absl//absl/strings", ], ) + +cc_library( + name = "kernel_stats_utils", + srcs = ["kernel_stats_utils.cc"], + hdrs = ["kernel_stats_utils.h"], + deps = [ + "//tensorflow/core:lib", + "//tensorflow/core/profiler/protobuf:kernel_stats_proto_cc", + "@com_google_absl//absl/strings", + ], +) diff --git a/tensorflow/core/profiler/utils/derived_timeline.cc
b/tensorflow/core/profiler/utils/derived_timeline.cc index b94d756020f..3e277e11aa9 100644 --- a/tensorflow/core/profiler/utils/derived_timeline.cc +++ b/tensorflow/core/profiler/utils/derived_timeline.cc @@ -32,6 +32,13 @@ namespace tensorflow { namespace profiler { namespace { +// TODO(profiler): Once we capture HLO protos for xla/gpu, we should use that +// to look up tensorflow op name from hlo_module/hlo_op. +absl::string_view DummySymbolResolver(absl::string_view hlo_module, + absl::string_view hlo_op) { + return absl::string_view(); +} + // Helper for deriving an XLine from events in another XLine. class DerivedXLineBuilder { public: @@ -147,7 +154,7 @@ void ProcessTfOpEvent(const XEventVisitor& event, void DeriveEventsFromAnnotations(const SymbolResolver& symbol_resolver, const EventGroupNameMap& event_group_name_map, - XPlane* device_trace) { + XPlane* device_trace, bool step_info_only) { // Merge and sort events by Timespan as they come from different lines. std::vector<XEventVisitor> events; uint64 start_timestamp_ns = 0; @@ -204,6 +211,8 @@ void DeriveEventsFromAnnotations(const SymbolResolver& symbol_resolver, } } + if (step_info_only) continue; + // For HLO/TF op lines, only use kernel events (i.e. excluding memcpy or // allocation events). if (!is_kernel) continue; @@ -331,17 +340,21 @@ void DeriveEventsFromHostTrace(const XPlane* host_trace, } void GenerateDerivedTimeLines(const EventGroupNameMap& event_group_name_map, - XSpace* space) { - // TODO(profiler): Once we capture HLO protos for xla/gpu, we should use that - // to look up tensorflow op name from hlo_module/hlo_op. - auto symbol_resolver = [&](absl::string_view hlo_module, - absl::string_view hlo_op) -> absl::string_view { - return absl::string_view(); - }; + XSpace* space, bool step_info_only) { for (XPlane& plane : *space->mutable_planes()) { // Derived timelines only generated for device traces. if (plane.id() == kHostPlaneId) continue; - DeriveEventsFromAnnotations(symbol_resolver, event_group_name_map, &plane); + DeriveEventsFromAnnotations(DummySymbolResolver, event_group_name_map, + &plane, step_info_only); + } +} + +void GenerateDerivedTimeLines(const EventGroupNameMap& event_group_name_map, + const std::vector<XPlane*>& device_traces, + bool step_info_only) { + for (XPlane* plane : device_traces) { + DeriveEventsFromAnnotations(DummySymbolResolver, event_group_name_map, + plane, step_info_only); } } diff --git a/tensorflow/core/profiler/utils/derived_timeline.h b/tensorflow/core/profiler/utils/derived_timeline.h index 5a99251a57c..0b5118ae6d9 100644 --- a/tensorflow/core/profiler/utils/derived_timeline.h +++ b/tensorflow/core/profiler/utils/derived_timeline.h @@ -33,7 +33,8 @@ typedef std::function<absl::string_view(absl::string_view hlo_module, absl::string_view hlo_op)> SymbolResolver; void GenerateDerivedTimeLines(const EventGroupNameMap& event_group_name_map, - XSpace* space); + XSpace* space, bool step_info_only = false); +void GenerateDerivedTimeLines(const EventGroupNameMap& event_group_name_map, + const std::vector<XPlane*>& device_traces, + bool step_info_only = false); } // namespace profiler } // namespace tensorflow
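A usage sketch for the new overload (the device_traces vector and event_group_name_map are assumed to come from the collector and a prior grouping pass): with step_info_only set, only the step-info line is derived and the HLO/TF-op lines are skipped:

    std::vector<XPlane*> device_traces;  // assumed: filled by the collector
    GenerateDerivedTimeLines(event_group_name_map, device_traces,
                             /*step_info_only=*/true);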
diff --git a/tensorflow/core/profiler/utils/event_span.cc b/tensorflow/core/profiler/utils/event_span.cc index 6e892a5d8e2..79b5c569e03 100644 --- a/tensorflow/core/profiler/utils/event_span.cc +++ b/tensorflow/core/profiler/utils/event_span.cc @@ -19,6 +19,7 @@ limitations under the License. #include <thread> // NOLINT #include <vector> +#include "absl/container/flat_hash_map.h" #include "absl/strings/match.h" #include "tensorflow/core/lib/gtl/map_util.h" @@ -219,9 +220,20 @@ std::string PrintEventTypeSpan(const EventTypeSpan& event_type_span) { event_type_span.span.DebugString(), ")"); } +absl::string_view PrintStepMarkerType(StepMarkerType type) { + switch (type) { + case StepMarkerType::kExplicitHostStepMarker: + return "ExplicitHostStepMarker"; + case StepMarkerType::kImplicitHostStepMarker: + return "ImplicitHostStepMarker"; + case StepMarkerType::kDeviceStepMarker: + return "DeviceStepMarker"; + } +} + std::string PrintStepMarker(const StepMarker& step_marker) { - std::string device_or_host = step_marker.on_device ? "device" : "host"; - return absl::StrCat("(", device_or_host, ", ", step_marker.event_name, ", ", + return absl::StrCat("(", PrintStepMarkerType(step_marker.type), ", ", + step_marker.event_name, ", ", step_marker.span.DebugString(), ")"); } @@ -287,15 +299,24 @@ void StepDetails::AppendEvents(const std::vector<EventTypeSpan>& other_events) { } Timespan StepDetails::StepTime() const { - // If there are multiple step-markers, uses the one that has the maximum - // duration. - Timespan max_steptime; + Timespan max_host_step_time; + Timespan max_device_step_time; for (const auto& marker : markers_) { - const Timespan& timespan = marker.span; - if (timespan.duration_ps() > max_steptime.duration_ps()) - max_steptime = timespan; + Timespan& cur_max_step_time = + marker.type == StepMarkerType::kDeviceStepMarker ? max_device_step_time + : max_host_step_time; + const Timespan& new_step_time = marker.span; + if (new_step_time.duration_ps() > cur_max_step_time.duration_ps()) + cur_max_step_time = new_step_time; } - return max_steptime; + // If the host step time includes the device step time, use the host step + // time. This covers two cases: (1) the device step marker is not available + // (e.g., CPU-only profiles) and (2) the device is synchronized at the end of + // each step. + if (max_host_step_time.Includes(max_device_step_time)) { + return max_host_step_time; + } + return max_device_step_time; } std::string StepDetails::DebugString() const { diff --git a/tensorflow/core/profiler/utils/event_span.h b/tensorflow/core/profiler/utils/event_span.h index 095d07a5dd5..b6590be6454 100644 --- a/tensorflow/core/profiler/utils/event_span.h +++ b/tensorflow/core/profiler/utils/event_span.h @@ -76,16 +76,28 @@ struct EventTypeSpan { } }; +enum class StepMarkerType { + // "TraceContext" TraceMe events. + kExplicitHostStepMarker, + // Identified by group_events (e.g., FunctionRun, SessionRun). + kImplicitHostStepMarker, + // Derived from the result of group_events. A device step marker starts with + // the first device event of the group and ends with the last event of the + // group. + kDeviceStepMarker, +}; + // Record of an event that is used as a step marker. struct StepMarker { - bool on_device; // true if this event happened on device. + StepMarkerType type; std::string event_name; // name of this event. Timespan span; // timespan of this event. - StepMarker(bool device, absl::string_view name, Timespan s) - : on_device(device), event_name(name), span(s) {} + StepMarker(StepMarkerType step_marker_type, absl::string_view name, + Timespan s) + : type(step_marker_type), event_name(name), span(s) {} // Equality test. bool operator==(const StepMarker& other) const { - return on_device == other.on_device && event_name == other.event_name && + return type == other.type && event_name == other.event_name && span == other.span; } // Inequality test.
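A worked example of the new StepTime() rule, with hypothetical spans (Timespan takes a begin timestamp and a duration in picoseconds): a host marker covering [0, 1000) and a device marker covering [100, 900) yield the host span, because it includes the device span; a device step that ran past the end of the host step would win instead.

    StepDetails details;
    details.AddMarker(StepMarker(StepMarkerType::kExplicitHostStepMarker,
                                 "train_step", Timespan(0, 1000)));
    details.AddMarker(StepMarker(StepMarkerType::kDeviceStepMarker,
                                 "step_info", Timespan(100, 800)));
    // details.StepTime() == Timespan(0, 1000): the host span is used.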
diff --git a/tensorflow/core/profiler/utils/group_events.cc b/tensorflow/core/profiler/utils/group_events.cc index 3c0b7d50f56..3c74e13bd79 100644 --- a/tensorflow/core/profiler/utils/group_events.cc +++ b/tensorflow/core/profiler/utils/group_events.cc @@ -79,6 +79,66 @@ void SetGroupId(const XPlaneVisitor& visitor, int64 group_id, XEvent* event) { event); } +using VirtualEventContainer = std::vector<std::unique_ptr<XEvent>>; + +using VirtualEventNodeMap = + absl::flat_hash_map<int64, absl::flat_hash_map<int64, EventNode*>>; + +std::unique_ptr<XEvent> CreateVirtualEvent(const XStat& step_id_stat, const XStat& iter_num_stat) { + auto virtual_event = absl::make_unique<XEvent>(); + *virtual_event->add_stats() = step_id_stat; + *virtual_event->add_stats() = iter_num_stat; + return virtual_event; +} + +// Create virtual events of HostEventType::kHostTrainingLoopIteration and event +// nodes for them. A virtual event is created for each iteration of the host +// training loop and connected to the HostEventType::kExecutorStateProcess event +// nodes of the iteration. +void CreateVirtualEvents(EventNodeMap* event_node_map, + VirtualEventContainer* virtual_event_container) { + VirtualEventNodeMap virtual_event_node_map; + auto executor_event_node_list = + gtl::FindOrNull(*event_node_map, HostEventType::kExecutorStateProcess); + if (!executor_event_node_list) return; + for (auto& executor_event_node : *executor_event_node_list) { + const XStat* step_id_stat = + executor_event_node->GetContextStat(StatType::kStepId); + const XStat* iter_num_stat = + executor_event_node->GetContextStat(StatType::kIterNum); + if (!step_id_stat || !iter_num_stat) continue; + int64 step_id = step_id_stat->int64_value(); + int64 iter_num = iter_num_stat->int64_value(); + // Process the event with nonzero iter_num only to filter out the events + // related to tf.data. + // TODO(jihochoi): Filter out tf.data events more reliably. + if (!iter_num) continue; + EventNode*& virtual_event_node = virtual_event_node_map[step_id][iter_num]; + if (!virtual_event_node) { + std::unique_ptr<XEvent> new_virtual_event = + CreateVirtualEvent(*step_id_stat, *iter_num_stat); + auto new_virtual_event_node = absl::make_unique<EventNode>( + &executor_event_node->GetPlaneVisitor(), new_virtual_event.get()); + // virtual_event_container keeps new_virtual_event alive. + virtual_event_container->push_back(std::move(new_virtual_event)); + virtual_event_node = new_virtual_event_node.get(); + // event_node_map keeps new_virtual_event_node alive.
+ (*event_node_map)[HostEventType::kHostTrainingLoopIteration].push_back( + std::move(new_virtual_event_node)); + } + virtual_event_node->AddChild(executor_event_node.get()); + } +} + +bool NeedsVirtualEvents( + const std::vector<int64>& root_event_types) { + return std::find(root_event_types.begin(), root_event_types.end(), + HostEventType::kHostTrainingLoopIteration) != + root_event_types.end(); +} + } // namespace const XStat* EventNode::GetContextStat(int64 stat_type) const { @@ -100,7 +160,7 @@ std::string EventNode::GetGroupName() const { step_num = step_num_stat->int64_value(); } if (const XStat* iter_num_stat = GetContextStat(StatType::kIterNum)) { - step_num += iter_num_stat->int64_value(); + step_num = iter_num_stat->int64_value(); } name_parts.push_back(absl::StrCat(step_num)); return absl::StrJoin(name_parts, " "); @@ -139,6 +199,7 @@ void ConnectIntraThread(const XPlaneVisitor& visitor, XPlane* plane, } } parent_nodes.push_back(cur_node.get()); + // event_node_map keeps cur_node alive. (*event_node_map)[GetEventType(visitor, event)].push_back( std::move(cur_node)); } @@ -215,6 +276,8 @@ void GroupEvents(const std::vector<InterThreadConnectInfo>& connect_info_list, const std::vector<int64>& root_event_types, XSpace* space, EventGroupNameMap* event_group_name_map) { EventNodeMap event_node_map; + // Keeps virtual events alive for this scope. + VirtualEventContainer virtual_event_container; std::vector<XPlaneVisitor> visitors; visitors.reserve(space->planes_size()); for (auto& plane : *space->mutable_planes()) { @@ -223,6 +286,9 @@ void GroupEvents(const std::vector<InterThreadConnectInfo>& connect_info_list, ConnectIntraThread(visitors.back(), &plane, &event_node_map); } ConnectInterThread(event_node_map, connect_info_list); + if (NeedsVirtualEvents(root_event_types)) { + CreateVirtualEvents(&event_node_map, &virtual_event_container); + } CreateEventGroup(root_event_types, event_node_map, event_group_name_map); } @@ -241,8 +307,8 @@ void GroupTfEvents(XSpace* space, EventGroupNameMap* event_group_name_map) { HostEventType::kKernelExecute, {StatType::kCorrelationId}}}); const std::vector<int64> root_event_types( - {HostEventType::kTraceContext, HostEventType::kFunctionRun, - HostEventType::kSessionRun}); + {HostEventType::kHostTrainingLoopIteration, HostEventType::kTraceContext, + HostEventType::kFunctionRun, HostEventType::kSessionRun}); GroupEvents(connect_info_list, root_event_types, space, event_group_name_map); }
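Sketch of the observable effect (assuming an XSpace populated by the host tracer): after grouping, a host-training-loop group is named by its iter_num alone, now that GetGroupName assigns rather than adds it; see GroupHostTrainingLoopTest below:

    EventGroupNameMap event_group_name_map;
    GroupTfEvents(&space, &event_group_name_map);
    // With an ExecutorStateProcess event carrying step_id=0 and iter_num=10,
    // the corresponding group name is "10".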
diff --git a/tensorflow/core/profiler/utils/group_events.h b/tensorflow/core/profiler/utils/group_events.h index 23931da2cb2..8404c42af6d 100644 --- a/tensorflow/core/profiler/utils/group_events.h +++ b/tensorflow/core/profiler/utils/group_events.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_CORE_PROFILER_UTILS_GROUP_EVENTS_H_ #define TENSORFLOW_CORE_PROFILER_UTILS_GROUP_EVENTS_H_ +#include <memory> + #include "absl/container/flat_hash_map.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/xplane_visitor.h" @@ -58,6 +60,8 @@ class EventNode { // Sets group_id for this node and its descendants. void PropagateGroupId(int64 group_id); + const XPlaneVisitor& GetPlaneVisitor() const { return *visitor_; } + const XEvent& GetEvent() const { return *event_; } const XStat* GetContextStat(int64 stat_type) const; diff --git a/tensorflow/core/profiler/utils/group_events_test.cc b/tensorflow/core/profiler/utils/group_events_test.cc index fc9f5c1987c..970d20385b7 100644 --- a/tensorflow/core/profiler/utils/group_events_test.cc +++ b/tensorflow/core/profiler/utils/group_events_test.cc @@ -134,17 +134,8 @@ TEST(GroupEventsTest, GroupGpuTraceTest) { CreateXEvent(&device_plane_builder, &stream, "matmul", 200, 300, {{StatType::kCorrelationId, 100}}); - std::vector<InterThreadConnectInfo> connect_info_list( - {{HostEventType::kFunctionRun, - HostEventType::kExecutorStateProcess, - {StatType::kStepId}}, - {HostEventType::kKernelLaunch, - HostEventType::kKernelExecute, - {StatType::kCorrelationId}}}); EventGroupNameMap event_group_name_map; - GroupEvents(connect_info_list, - {HostEventType::kTraceContext, HostEventType::kFunctionRun}, - &space, &event_group_name_map); + GroupTfEvents(&space, &event_group_name_map); XPlaneVisitor device_plane_visitor = CreateTfXPlaneVisitor(device_plane); EXPECT_EQ(device_plane->lines(0).events(0).stats_size(), 2); EXPECT_EQ(device_plane_visitor.GetStatType( @@ -154,6 +145,38 @@ TEST(GroupEventsTest, GroupGpuTraceTest) { EXPECT_EQ(event_group_name_map[0], "123"); } +TEST(GroupEventsTest, GroupHostTrainingLoopTest) { + XSpace space; + XPlaneBuilder host_plane_builder(space.add_planes()); + host_plane_builder.SetName(kHostThreads); + host_plane_builder.ReserveLines(1); + + auto tf_executor_thread = host_plane_builder.GetOrCreateLine(0); + CreateXEvent(&host_plane_builder, &tf_executor_thread, + HostEventType::kExecutorStateProcess, 20, 80, + {{StatType::kStepId, 0}, {StatType::kIterNum, 10}}); + CreateXEvent(&host_plane_builder, &tf_executor_thread, "matmul", 30, 70, + {{StatType::kCorrelationId, 100}}); + + XPlane* device_plane = space.add_planes(); + XPlaneBuilder device_plane_builder(device_plane); + device_plane_builder.ReserveLines(1); + + auto stream = device_plane_builder.GetOrCreateLine(0); + CreateXEvent(&device_plane_builder, &stream, "matmul", 200, 300, + {{StatType::kCorrelationId, 100}}); + + EventGroupNameMap event_group_name_map; + GroupTfEvents(&space, &event_group_name_map); + XPlaneVisitor device_plane_visitor = CreateTfXPlaneVisitor(device_plane); + EXPECT_EQ(device_plane->lines(0).events(0).stats_size(), 2); + EXPECT_EQ(device_plane_visitor.GetStatType( + device_plane->lines(0).events(0).stats(1)), + StatType::kGroupId); + EXPECT_EQ(event_group_name_map.size(), 1); + EXPECT_EQ(event_group_name_map[0], "10"); +} + } // namespace } // namespace profiler } // namespace tensorflow diff --git a/tensorflow/core/profiler/utils/kernel_stats_utils.cc b/tensorflow/core/profiler/utils/kernel_stats_utils.cc new file mode 100644 index 00000000000..665e802229d --- /dev/null +++ b/tensorflow/core/profiler/utils/kernel_stats_utils.cc @@ -0,0 +1,209 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/profiler/utils/kernel_stats_utils.h" + +#include <algorithm> +#include <tuple> + +#include "absl/strings/match.h" +#include "absl/strings/numbers.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h" + +namespace tensorflow { +namespace profiler { + +void ParseKernelLaunchParams(absl::string_view xstat_kernel_details, + KernelReport* kernel) { + const std::vector<absl::string_view> params = + absl::StrSplit(xstat_kernel_details, absl::ByAnyChar(":\n")); + + constexpr uint32_t kNumDimensions = 3; + for (uint32_t dim = 0; dim < kNumDimensions; ++dim) { + kernel->add_block_dim(1); + kernel->add_grid_dim(1); + } + + // Process value pairs. + for (uint32_t ii = 0; ii < params.size(); ii += 2) { + uint32_t value = 0; + if (params[ii] == "registers_per_thread" && + absl::SimpleAtoi(params[ii + 1], &value)) { + kernel->set_registers_per_thread(value); + } else if (params[ii] == "static_shared_memory_usage" && + absl::SimpleAtoi(params[ii + 1], &value)) { + kernel->set_static_shmem_bytes(value); + } else if (params[ii] == "dynamic_shared_memory_usage" && + absl::SimpleAtoi(params[ii + 1], &value)) { + kernel->set_dynamic_shmem_bytes(value); + } else if (params[ii] == "block_x" && + absl::SimpleAtoi(params[ii + 1], &value)) { + kernel->mutable_block_dim()->Set(0, value); + } else if (params[ii] == "block_y" && + absl::SimpleAtoi(params[ii + 1], &value)) { + kernel->mutable_block_dim()->Set(1, value); + } else if (params[ii] == "block_z" && + absl::SimpleAtoi(params[ii + 1], &value)) { + kernel->mutable_block_dim()->Set(2, value); + } else if (params[ii] == "grid_x" && + absl::SimpleAtoi(params[ii + 1], &value)) { + kernel->mutable_grid_dim()->Set(0, value); + } else if (params[ii] == "grid_y" && + absl::SimpleAtoi(params[ii + 1], &value)) { + kernel->mutable_grid_dim()->Set(1, value); + } else if (params[ii] == "grid_z" && + absl::SimpleAtoi(params[ii + 1], &value)) { + kernel->mutable_grid_dim()->Set(2, value); + } + } +}
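The parser above implies a KernelDetails format of key/value tokens separated by ':' or newlines; a hypothetical stat value and its effect (the format is inferred from the split logic, not from separate documentation):

    constexpr char kDetails[] =
        "registers_per_thread:32\n"
        "static_shared_memory_usage:0\n"
        "dynamic_shared_memory_usage:16384\n"
        "block_x:128\nblock_y:1\nblock_z:1\n"
        "grid_x:1024\ngrid_y:1\ngrid_z:1";
    KernelReport kernel;
    ParseKernelLaunchParams(kDetails, &kernel);
    // kernel.block_dim() == {128, 1, 1}; kernel.grid_dim() == {1024, 1, 1}.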
+bool IsKernelUsingTensorCore(absl::string_view kernel_name) { + // Some examples: volta_h884gemm, volta_fp16_s884gemm, + // turing_fp16_s1688cudnn_fp16 + bool possible_tensor_kernel = absl::StrContains(kernel_name, "884") || + absl::StrContains(kernel_name, "1688"); +#if defined(VLOG_IF) + VLOG_IF(1, possible_tensor_kernel) + << "Possible tensor kernel: " << kernel_name << "\n"; +#endif // defined(VLOG_IF) + + return (absl::StartsWith(kernel_name, "volta_i884") || + absl::StartsWith(kernel_name, "volta_h884") || + absl::StartsWith(kernel_name, "volta_s884") || + absl::StartsWith(kernel_name, "volta_fp16_i884") || + absl::StartsWith(kernel_name, "volta_fp16_h884") || + absl::StartsWith(kernel_name, "volta_fp16_s884") || + absl::StartsWith(kernel_name, "turing_i1688") || + absl::StartsWith(kernel_name, "turing_h1688") || + absl::StartsWith(kernel_name, "turing_s1688") || + absl::StartsWith(kernel_name, "turing_fp16_i1688") || + absl::StartsWith(kernel_name, "turing_fp16_h1688") || + absl::StartsWith(kernel_name, "turing_fp16_s1688")); +} + +// This list is not exhaustive.
+bool IsOpTensorCoreEligible(absl::string_view tf_op_name) { + return (absl::StrContains(tf_op_name, "Conv") || + absl::StrContains(tf_op_name, "Einsum")); +} + +bool KernelReportLessThanComparator::operator()(const KernelReport& lhs, + const KernelReport& rhs) { + // Disable formatting to keep vertical alignment for better readability, + // and make it easier to reorder columns. + // clang-format off + auto lhs_tuple = std::make_tuple( + lhs.name(), + lhs.grid_dim(0), + lhs.grid_dim(1), + lhs.grid_dim(2), + lhs.block_dim(0), + lhs.block_dim(1), + lhs.block_dim(2), + lhs.registers_per_thread(), + lhs.static_shmem_bytes(), + lhs.dynamic_shmem_bytes(), + lhs.is_kernel_using_tensor_core(), + lhs.is_op_tensor_core_eligible(), + lhs.op_name()); + + auto rhs_tuple = std::make_tuple( + rhs.name(), + rhs.grid_dim(0), + rhs.grid_dim(1), + rhs.grid_dim(2), + rhs.block_dim(0), + rhs.block_dim(1), + rhs.block_dim(2), + rhs.registers_per_thread(), + rhs.static_shmem_bytes(), + rhs.dynamic_shmem_bytes(), + rhs.is_kernel_using_tensor_core(), + rhs.is_op_tensor_core_eligible(), + rhs.op_name()); + // clang-format on + return lhs_tuple < rhs_tuple; +} + +bool KernelReportEqualToComparator::operator()(const KernelReport& lhs, + const KernelReport& rhs) { + // Disable formatting to keep vertical alignment for better readability, + // and make it easier to reorder columns. + // clang-format off + // Put the most expensive string comparisons last. + return ( + lhs.is_kernel_using_tensor_core() == rhs.is_kernel_using_tensor_core() && + lhs.is_op_tensor_core_eligible() == rhs.is_op_tensor_core_eligible() && + lhs.block_dim(0) == rhs.block_dim(0) && + lhs.block_dim(1) == rhs.block_dim(1) && + lhs.block_dim(2) == rhs.block_dim(2) && + lhs.grid_dim(0) == rhs.grid_dim(0) && + lhs.grid_dim(1) == rhs.grid_dim(1) && + lhs.grid_dim(2) == rhs.grid_dim(2) && + lhs.registers_per_thread() == rhs.registers_per_thread() && + lhs.static_shmem_bytes() == rhs.static_shmem_bytes() && + lhs.dynamic_shmem_bytes() == rhs.dynamic_shmem_bytes() && + lhs.name() == rhs.name() && + lhs.op_name() == rhs.op_name()); + // clang-format on +} + +void SortKernelsByTotalDurationDesc(KernelStatsDb* kernel_stats_db) { + // Sort kernel reports by total duration in descending order. + std::sort(kernel_stats_db->mutable_reports()->begin(), + kernel_stats_db->mutable_reports()->end(), + [](const KernelReport& lhs, const KernelReport& rhs) { + return lhs.total_duration_ns() > rhs.total_duration_ns() || + (lhs.total_duration_ns() == rhs.total_duration_ns() && + KernelReportLessThanComparator()(lhs, rhs)); + }); +} + +void GroupKernelReports(std::vector<KernelReport>* reports, + KernelStatsDb* dst) { + // Sort reports by grouping criteria. + std::sort(reports->begin(), reports->end(), KernelReportLessThanComparator()); + + // Group reports together. + KernelReport* prev = nullptr; + for (const KernelReport& report : *reports) { + DCHECK_EQ(3, report.grid_dim_size()); + DCHECK_EQ(3, report.block_dim_size()); + if (prev != nullptr && KernelReportEqualToComparator()(*prev, report)) { + // Previous element is identical to the one that we are adding, so + // aggregate them. + prev->set_occurrences(prev->occurrences() + 1); + prev->set_max_duration_ns( + std::max(prev->max_duration_ns(), report.max_duration_ns())); + prev->set_min_duration_ns( + std::min(prev->min_duration_ns(), report.min_duration_ns())); + prev->set_total_duration_ns(prev->total_duration_ns() + + report.total_duration_ns()); + } else { + // Current element does not exist yet. + prev = dst->add_reports(); + *prev = report; + prev->set_occurrences(1); + } + } +} + +} // namespace profiler +} // namespace tensorflow
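An end-to-end sketch of the aggregation contract, with hypothetical inputs: two identical launches collapse into one report whose occurrences and total duration accumulate:

    std::vector<KernelReport> reports(2);
    for (KernelReport& r : reports) {
      r.set_name("example_kernel");  // hypothetical name
      for (int i = 0; i < 3; ++i) { r.add_block_dim(1); r.add_grid_dim(1); }
      r.set_total_duration_ns(100);
      r.set_min_duration_ns(100);
      r.set_max_duration_ns(100);
    }
    KernelStatsDb db;
    GroupKernelReports(&reports, &db);
    SortKernelsByTotalDurationDesc(&db);
    // db.reports(0): occurrences() == 2, total_duration_ns() == 200.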
diff --git a/tensorflow/core/profiler/utils/kernel_stats_utils.h b/tensorflow/core/profiler/utils/kernel_stats_utils.h new file mode 100644 index 00000000000..7b121b49e85 --- /dev/null +++ b/tensorflow/core/profiler/utils/kernel_stats_utils.h @@ -0,0 +1,56 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_UTILS_KERNEL_STATS_UTILS_H_ +#define TENSORFLOW_CORE_PROFILER_UTILS_KERNEL_STATS_UTILS_H_ + +#include <vector> + +#include "absl/strings/string_view.h" +#include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h" + +namespace tensorflow { +namespace profiler { + +// Populates kernel launch information from a KernelDetails XStat. +void ParseKernelLaunchParams(absl::string_view xstat_kernel_details, + KernelReport* kernel); + +// Returns true if kernel uses TensorCores. +bool IsKernelUsingTensorCore(absl::string_view kernel_name); + +// Returns true if operation is eligible to use TensorCores. +bool IsOpTensorCoreEligible(absl::string_view tf_op_name); + +// Less than comparator for Kernel Reports. +struct KernelReportLessThanComparator { + bool operator()(const KernelReport& lhs, const KernelReport& rhs); +}; + +// Equal to comparator for Kernel Reports. +struct KernelReportEqualToComparator { + bool operator()(const KernelReport& lhs, const KernelReport& rhs); +}; + +// Sorts kernel reports by total duration in descending order. +void SortKernelsByTotalDurationDesc(KernelStatsDb* kernel_stats_db); + +// Groups and aggregates common reports into the destination KernelStatsDb. +void GroupKernelReports(std::vector<KernelReport>* reports, KernelStatsDb* dst); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_UTILS_KERNEL_STATS_UTILS_H_ diff --git a/tensorflow/core/profiler/utils/xplane_schema.cc b/tensorflow/core/profiler/utils/xplane_schema.cc index 3b9531ea6e0..9a20cc51c55 100644 --- a/tensorflow/core/profiler/utils/xplane_schema.cc +++ b/tensorflow/core/profiler/utils/xplane_schema.cc @@ -82,6 +82,8 @@ const HostEventTypeMap& GetHostEventTypeMap() { {"PartitionedCallOp", kPartitionedCallOp}, // tf.data related. {"IteratorGetNextOp::DoCompute", kIteratorGetNextOp}, // Virtual events for grouping. + {"HostTrainingLoopIteration", kHostTrainingLoopIteration}, // GPU related. {"KernelLaunch", kKernelLaunch}, {"KernelExecute", kKernelExecute}, diff --git a/tensorflow/core/profiler/utils/xplane_schema.h b/tensorflow/core/profiler/utils/xplane_schema.h index 5e059154afd..51a94e5c760 100644 --- a/tensorflow/core/profiler/utils/xplane_schema.h +++ b/tensorflow/core/profiler/utils/xplane_schema.h @@ -76,6 +76,8 @@ enum HostEventType { kPartitionedCallOp, // tf.data related. kIteratorGetNextOp, + // Virtual events for grouping. + kHostTrainingLoopIteration, // GPU related.
kKernelLaunch, kKernelExecute, diff --git a/tensorflow/core/protobuf/data/experimental/snapshot.proto b/tensorflow/core/protobuf/data/experimental/snapshot.proto index 422602d3760..e013deb2ee1 100644 --- a/tensorflow/core/protobuf/data/experimental/snapshot.proto +++ b/tensorflow/core/protobuf/data/experimental/snapshot.proto @@ -3,6 +3,8 @@ syntax = "proto3"; package tensorflow.data.experimental; import "tensorflow/core/framework/tensor.proto"; +import "tensorflow/core/framework/tensor_shape.proto"; +import "tensorflow/core/framework/types.proto"; // Each SnapshotRecord represents one batch of pre-processed input data. A batch // consists of a list of tensors that we encode as TensorProtos. This message @@ -13,9 +15,29 @@ message SnapshotRecord { // This stores the metadata information present in each snapshot record. message SnapshotMetadataRecord { + // Stores the fingerprint of the graph that describes the dataset that is + // snapshotted. string graph_hash = 1; + // Run ID that this snapshot corresponds to. string run_id = 2; + // Time when we started creating this snapshot. int64 creation_timestamp = 3; + // Version of the snapshot data file format. + int64 version = 4; + // A list of tensor dtypes, one for each element of the snapshot. + repeated .tensorflow.DataType dtype = 5; bool finalized = 1000; } + +// Metadata for a single tensor in the Snapshot Record. +message TensorMetadata { + .tensorflow.TensorShapeProto tensor_shape = 2; + // Number of uncompressed bytes used to store the tensor representation. + int64 tensor_size_bytes = 3; +} + +// Metadata for all the tensors in a Snapshot Record. +message SnapshotTensorMetadata { + repeated TensorMetadata tensor_metadata = 1; +} diff --git a/tensorflow/core/protobuf/eager_service.proto b/tensorflow/core/protobuf/eager_service.proto index cd2a7c7a24f..a68f487f8f1 100644 --- a/tensorflow/core/protobuf/eager_service.proto +++ b/tensorflow/core/protobuf/eager_service.proto @@ -56,9 +56,9 @@ message QueueItem { RegisterFunctionOp register_function = 4; CleanupFunctionOp cleanup_function = 5; // A remote executor is created to execute ops/functions asynchronously - // enqueued in streaming call. Request with this item type clears pending - // nodes and status of the executor on the remote worker. - ClearRemoteExecutorForStream clear_remote_executor_for_stream = 6; + // enqueued in a streaming call. A request with this item type waits for + // pending nodes to finish on the remote executor and reports status. + SyncRemoteExecutorForStream sync_remote_executor_for_stream = 6; } } @@ -196,7 +196,7 @@ message CleanupFunctionOp { int64 step_id = 1; } -message ClearRemoteExecutorForStream {} +message SyncRemoteExecutorForStream {} message SendTensorOp { // All remote tensors are identified by . To mimic this diff --git a/tensorflow/core/util/example_proto_fast_parsing.cc b/tensorflow/core/util/example_proto_fast_parsing.cc index 8d90bd67667..24bdb019fa6 100644 --- a/tensorflow/core/util/example_proto_fast_parsing.cc +++ b/tensorflow/core/util/example_proto_fast_parsing.cc @@ -44,6 +44,55 @@ namespace { template <typename T> using SmallVector = gtl::InlinedVector<T, 4>; +template <typename T> +class LimitedArraySlice { + public: + using value_type = T; + + LimitedArraySlice(T* begin, size_t num_elements) + : current_(begin), begin_(begin), end_(begin + num_elements) {} + + // May return negative if there were push_back calls after slice was filled. + int64 EndDistance() const { return end_ - current_; } + + // Attempts to push value to the back of this.
If the slice has + // already been filled, this method has no effect on the underlying data, but + // it changes the number returned by EndDistance into negative values. + void push_back(T&& value) { + if (EndDistance() > 0) *current_ = std::move(value); + ++current_; + } + + // "Constructs" an element at the back of this by resizing the slice, and + // returns a mutable reference to the new last element. + // REQUIRES: EndDistance() > 0. + T& construct_at_end() { + DCHECK_GT(EndDistance(), 0); + return *(current_++); + } + + // Returns a mutable reference to the last element in the slice. + // REQUIRES: size() > 0. + T& back() { return *(current_ - 1); } + + // Returns the number of elements in the slice. + size_t size() const { return std::min(current_ - begin_, end_ - begin_); } + + // Attempts to resize the vector to the given size. It does so by advancing + // the pointer to the current element, possibly beyond the end of the slice. + // As a consequence, calling `size()` after `resize(x)` was called might + // return a value less than `x`. + void resize(size_t size) { current_ = begin_ + size; } + + // Returns the pointer to the underlying data buffer. + T* data() { return begin_; } + + private: + T* current_; + T* begin_; + T* end_; +}; + template <typename A> auto EnableAliasing(A* a) -> decltype(a->EnableAliasing(true), void()) { a->EnableAliasing(true); @@ -117,6 +166,14 @@ class Feature { return true; } + // Helper methods + tstring& construct_at_end(LimitedArraySlice<tstring>* bytes_list) { + return bytes_list->construct_at_end(); + } + tstring& construct_at_end(SmallVector<tstring>* bytes_list) { + return bytes_list->emplace_back(); + } + template <typename Result> bool ParseBytesList(Result* bytes_list) { DCHECK(bytes_list != nullptr); @@ -135,8 +192,7 @@ class Feature { // parse string uint32 bytes_length; if (!stream.ReadVarint32(&bytes_length)) return false; - bytes_list->push_back({}); - tstring& bytes = bytes_list->back(); + tstring& bytes = construct_at_end(bytes_list); bytes.resize_uninitialized(bytes_length); if (!stream.ReadRaw(bytes.data(), bytes_length)) return false; } @@ -486,47 +542,6 @@ struct SeededHasher { uint64 seed{0xDECAFCAFFE}; }; -template <typename T> -class LimitedArraySlice { - public: - using value_type = T; - - LimitedArraySlice(T* begin, size_t num_elements) - : current_(begin), begin_(begin), end_(begin + num_elements) {} - - // May return negative if there were push_back calls after slice was filled. - int64 EndDistance() const { return end_ - current_; } - - // Attempts to push value to the back of this. If the slice has - // already been filled, this method has no effect on the underlying data, but - // it changes the number returned by EndDistance into negative values. - void push_back(T&& value) { - if (EndDistance() > 0) *current_ = std::move(value); - ++current_; - } - - // Returns a mutable reference to the last element in the slice. - // REQUIRES: size() > 0. - T& back() { return *(current_ - 1); } - - // Returns the number of elements in the slice. - size_t size() const { return std::min(current_ - begin_, end_ - begin_); } - - // Attempts to resize the vector to the given size. It does so by advancing - // the pointer to the current element, possibly beyond the end of the slice. - // As a consequence, calling `size()` after `resize(x)` was called might - // return a value less than `x`. - void resize(size_t size) { current_ = begin_ + size; } - - // Returns the pointer to the underlying data buffer. - T* data() { return begin_; } - - private: - T* current_; - T* begin_; - T* end_; -}; - void LogDenseFeatureDataLoss(StringPiece feature_name) { LOG(WARNING) << "Data loss! Feature '" << feature_name << "' is present in multiple concatenated "
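A sketch of the moved class's contract over a hypothetical two-element buffer: writes past capacity leave the data untouched but are tracked through EndDistance():

    tstring buffer[2];
    LimitedArraySlice<tstring> slice(buffer, 2);
    slice.construct_at_end() = "a";  // fills element 0 in place
    slice.push_back("b");            // fills element 1
    slice.push_back("c");            // beyond capacity: buffer unchanged
    // slice.size() == 2; slice.EndDistance() == -1 signals the overflow.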
diff --git a/tensorflow/core/util/mkl_types.h b/tensorflow/core/util/mkl_types.h index 299ac14d7b7..e05bee3cc8a 100644 --- a/tensorflow/core/util/mkl_types.h +++ b/tensorflow/core/util/mkl_types.h @@ -118,7 +118,7 @@ namespace tensorflow { #define ADD_MD add_pd #define ALGORITHM mkldnn #define ALGORITHM_UNDEF ALGORITHM::algorithm_undef -#define CPU_STREAM(engine) stream(stream::kind::eager) +#define CPU_STREAM(engine) stream(stream::kind::eager_nostore) #define DATA_WITH_ENGINE(data, engine) data #define DST_MD dst_pd #define ENGINE_CPU engine::cpu diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index 982eb969965..b5f6b2a705e 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -677,7 +677,7 @@ inline void ExecutePrimitive(const std::vector<primitive>& net, } cpu_stream.wait(); #else - stream(stream::kind::eager).submit(net).wait(); + stream(stream::kind::eager_nostore).submit(net).wait(); #endif // ENABLE_MKLDNN_V1 } @@ -1629,7 +1629,7 @@ class MklDnnData { reorder_memory_ = new memory(op_pd); std::vector<primitive> net; net.push_back(FindOrCreateReorder(user_memory_, reorder_memory_)); - stream(stream::kind::eager).submit(net).wait(); + stream(stream::kind::eager_nostore).submit(net).wait(); #endif // ENABLE_MKLDNN_V1 return true; } @@ -1707,7 +1707,7 @@ class MklDnnData { std::vector<primitive> net; reorder_memory_ = new memory(op_pd, reorder_data_handle); net.push_back(FindOrCreateReorder(user_memory_, reorder_memory_)); - stream(stream::kind::eager).submit(net).wait(); + stream(stream::kind::eager_nostore).submit(net).wait(); #endif // ENABLE_MKLDNN_V1 return true; } diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc index 76fba798f02..454d733cb3b 100644 --- a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc +++ b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc @@ -41,6 +41,7 @@ limitations under the License.
#include "tensorflow/core/lib/random/random.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/util/env_var.h" #include "tensorflow/core/util/saved_tensor_slice_util.h" #include "tensorflow/core/util/tensor_bundle/byte_swap.h" #include "tensorflow/core/util/tensor_slice_util.h" @@ -739,6 +740,7 @@ BundleReader::BundleReader(Env* env, StringPiece prefix) prefix_(prefix), metadata_(nullptr), table_(nullptr), + index_cache_(nullptr), iter_(nullptr), need_to_swap_bytes_(false) { const string filename = MetaFilename(prefix_); @@ -751,7 +753,17 @@ BundleReader::BundleReader(Env* env, StringPiece prefix) status_ = env_->NewRandomAccessFile(filename, &wrapper); if (!status_.ok()) return; metadata_ = wrapper.release(); - status_ = table::Table::Open(table::Options(), metadata_, file_size, &table_); + + table::Options o; + int64 cache_size; + Status s = + ReadInt64FromEnvVar("TF_TABLE_INDEX_CACHE_SIZE_IN_MB", 0, &cache_size); + if (s.ok() && cache_size > 0) { + index_cache_ = table::NewLRUCache(cache_size << 20); + o.block_cache = index_cache_; + } + + status_ = table::Table::Open(o, metadata_, file_size, &table_); if (!status_.ok()) return; iter_ = table_->NewIterator(); @@ -782,6 +794,9 @@ BundleReader::~BundleReader() { delete metadata_; delete iter_; delete table_; + if (index_cache_) { + delete index_cache_; + } // InputBuffer does not own the underlying RandomAccessFile. for (auto pair : data_) { if (pair.second != nullptr && pair.second->file() != nullptr) { diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle.h b/tensorflow/core/util/tensor_bundle/tensor_bundle.h index 882a6a4e007..c441000e47d 100644 --- a/tensorflow/core/util/tensor_bundle/tensor_bundle.h +++ b/tensorflow/core/util/tensor_bundle/tensor_bundle.h @@ -61,8 +61,6 @@ limitations under the License. #ifndef TENSORFLOW_CORE_UTIL_TENSOR_BUNDLE_TENSOR_BUNDLE_H_ #define TENSORFLOW_CORE_UTIL_TENSOR_BUNDLE_TENSOR_BUNDLE_H_ -#include "tensorflow/core/protobuf/tensor_bundle.pb.h" - #include #include #include @@ -72,12 +70,14 @@ limitations under the License. #include "tensorflow/core/framework/tensor_slice.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/io/cache.h" #include "tensorflow/core/lib/io/inputbuffer.h" #include "tensorflow/core/lib/io/table.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/file_system.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/tensor_bundle.pb.h" #include "tensorflow/core/util/tensor_bundle/naming.h" #include "tensorflow/core/util/tensor_slice_set.h" @@ -289,6 +289,7 @@ class BundleReader { Status status_; RandomAccessFile* metadata_; // Owned. table::Table* table_; + table::Cache* index_cache_; table::Iterator* iter_; // Owned the InputBuffer objects and their underlying RandomAccessFile's. 
diff --git a/tensorflow/examples/speech_commands/BUILD b/tensorflow/examples/speech_commands/BUILD index ea8a7d89ed9..b4b07c368b4 100644 --- a/tensorflow/examples/speech_commands/BUILD +++ b/tensorflow/examples/speech_commands/BUILD @@ -48,7 +48,6 @@ py_binary( ":recognize_commands_py", "//tensorflow:tensorflow_py", "//third_party/py/numpy", - "@six_archive//:six", ], ) @@ -172,8 +171,6 @@ py_library( ":input_data", ":models", "//tensorflow:tensorflow_py", - "//third_party/py/numpy", - "@six_archive//:six", ], ) @@ -216,8 +213,6 @@ py_library( ":input_data", ":models", "//tensorflow:tensorflow_py", - "//third_party/py/numpy", - "@six_archive//:six", ], ) @@ -261,7 +256,6 @@ py_library( ":models", "//tensorflow:tensorflow_py", "//third_party/py/numpy", - "@six_archive//:six", ], ) diff --git a/tensorflow/examples/speech_commands/freeze.py b/tensorflow/examples/speech_commands/freeze.py index 1cc7138cebf..4a48a440b6e 100644 --- a/tensorflow/examples/speech_commands/freeze.py +++ b/tensorflow/examples/speech_commands/freeze.py @@ -44,10 +44,10 @@ import sys import tensorflow as tf -from tensorflow.python.ops import gen_audio_ops as audio_ops import input_data import models from tensorflow.python.framework import graph_util +from tensorflow.python.ops import gen_audio_ops as audio_ops # If it's available, load the specialized feature generator. If this doesn't # work, try building with bazel instead of running the Python script directly. diff --git a/tensorflow/examples/speech_commands/input_data.py b/tensorflow/examples/speech_commands/input_data.py index 2f104b9eaea..fbbbc6fdc50 100644 --- a/tensorflow/examples/speech_commands/input_data.py +++ b/tensorflow/examples/speech_commands/input_data.py @@ -233,15 +233,15 @@ class AudioProcessor(object): filepath, _ = urllib.request.urlretrieve(data_url, filepath, _progress) except: tf.compat.v1.logging.error( - 'Failed to download URL: %s to folder: %s', data_url, filepath) - tf.compat.v1.logging.error( - 'Please make sure you have enough free space and' - ' an internet connection') + 'Failed to download URL: {0} to folder: {1}.
Please make sure you ' + 'have enough free space and an internet connection'.format( + data_url, filepath)) raise print() statinfo = os.stat(filepath) - tf.compat.v1.logging.info('Successfully downloaded %s (%d bytes)', - filename, statinfo.st_size) + tf.compat.v1.logging.info( + 'Successfully downloaded {0} ({1} bytes)'.format( + filename, statinfo.st_size)) tarfile.open(filepath, 'r:gz').extractall(dest_directory) def prepare_data_index(self, silence_percentage, unknown_percentage, diff --git a/tensorflow/examples/speech_commands/input_data_test.py b/tensorflow/examples/speech_commands/input_data_test.py index 274d33b333f..ad20911a284 100644 --- a/tensorflow/examples/speech_commands/input_data_test.py +++ b/tensorflow/examples/speech_commands/input_data_test.py @@ -33,7 +33,7 @@ from tensorflow.python.platform import test class InputDataTest(test.TestCase): def _getWavData(self): - with self.cached_session() as sess: + with self.cached_session(): sample_data = tf.zeros([32000, 2]) wav_encoder = tf.audio.encode_wav(sample_data, 16000) wav_data = self.evaluate(wav_encoder) @@ -105,11 +105,11 @@ class InputDataTest(test.TestCase): ["a", "b"], 10, 10, self._model_settings(), tmp_dir) self.assertLess(0, audio_processor.set_size("training")) - self.assertTrue("training" in audio_processor.data_index) - self.assertTrue("validation" in audio_processor.data_index) - self.assertTrue("testing" in audio_processor.data_index) - self.assertEquals(input_data.UNKNOWN_WORD_INDEX, - audio_processor.word_to_index["c"]) + self.assertIn("training", audio_processor.data_index) + self.assertIn("validation", audio_processor.data_index) + self.assertIn("testing", audio_processor.data_index) + self.assertEqual(input_data.UNKNOWN_WORD_INDEX, + audio_processor.word_to_index["c"]) def testPrepareDataIndexEmpty(self): tmp_dir = self.get_temp_dir() @@ -117,7 +117,7 @@ class InputDataTest(test.TestCase): with self.assertRaises(Exception) as e: _ = input_data.AudioProcessor("", tmp_dir, 10, 10, ["a", "b"], 10, 10, self._model_settings(), tmp_dir) - self.assertTrue("No .wavs found" in str(e.exception)) + self.assertIn("No .wavs found", str(e.exception)) def testPrepareDataIndexMissing(self): tmp_dir = self.get_temp_dir() @@ -125,7 +125,7 @@ class InputDataTest(test.TestCase): with self.assertRaises(Exception) as e: _ = input_data.AudioProcessor("", tmp_dir, 10, 10, ["a", "b", "d"], 10, 10, self._model_settings(), tmp_dir) - self.assertTrue("Expected to find" in str(e.exception)) + self.assertIn("Expected to find", str(e.exception)) @test_util.run_deprecated_v1 def testPrepareBackgroundData(self): diff --git a/tensorflow/examples/speech_commands/label_wav.py b/tensorflow/examples/speech_commands/label_wav.py index 2a0190df616..3d8c3b67bf1 100644 --- a/tensorflow/examples/speech_commands/label_wav.py +++ b/tensorflow/examples/speech_commands/label_wav.py @@ -77,13 +77,12 @@ def run_graph(wav_data, labels, input_layer_name, output_layer_name, def label_wav(wav, labels, graph, input_name, output_name, how_many_labels): """Loads the model and labels, and runs the inference to print predictions.""" if not wav or not tf.io.gfile.exists(wav): - tf.compat.v1.logging.fatal('Audio file does not exist %s', wav) - + raise ValueError('Audio file does not exist at {0}'.format(wav)) if not labels or not tf.io.gfile.exists(labels): - tf.compat.v1.logging.fatal('Labels file does not exist %s', labels) + raise ValueError('Labels file does not exist at {0}'.format(labels)) if not graph or not tf.io.gfile.exists(graph): - 
tf.compat.v1.logging.fatal('Graph file does not exist %s', graph) + raise ValueError('Graph file does not exist at {0}'.format(graph)) labels_list = load_labels(labels) diff --git a/tensorflow/examples/speech_commands/label_wav_dir.py b/tensorflow/examples/speech_commands/label_wav_dir.py index 313647b1ee7..d6016a06b62 100644 --- a/tensorflow/examples/speech_commands/label_wav_dir.py +++ b/tensorflow/examples/speech_commands/label_wav_dir.py @@ -64,8 +64,7 @@ def run_graph(wav_dir, labels, input_layer_name, output_layer_name, # predictions per class for wav_path in glob.glob(wav_dir + '/*.wav'): if not wav_path or not tf.io.gfile.exists(wav_path): - tf.compat.v1.logging.fatal('Audio file does not exist %s', wav_path) - + raise ValueError('Audio file does not exist at {0}'.format(wav_path)) with open(wav_path, 'rb') as wav_file: wav_data = wav_file.read() @@ -86,10 +85,10 @@ def run_graph(wav_dir, labels, input_layer_name, output_layer_name, def label_wav(wav_dir, labels, graph, input_name, output_name, how_many_labels): """Loads the model and labels, and runs the inference to print predictions.""" if not labels or not tf.io.gfile.exists(labels): - tf.compat.v1.logging.fatal('Labels file does not exist %s', labels) + raise ValueError('Labels file does not exist at {0}'.format(labels)) if not graph or not tf.io.gfile.exists(graph): - tf.compat.v1.logging.fatal('Graph file does not exist %s', graph) + raise ValueError('Graph file does not exist at {0}'.format(graph)) labels_list = load_labels(labels) diff --git a/tensorflow/examples/speech_commands/label_wav_test.py b/tensorflow/examples/speech_commands/label_wav_test.py index 0e52d1b4388..67aec3dbece 100644 --- a/tensorflow/examples/speech_commands/label_wav_test.py +++ b/tensorflow/examples/speech_commands/label_wav_test.py @@ -29,7 +29,7 @@ from tensorflow.python.platform import test class LabelWavTest(test.TestCase): def _getWavData(self): - with self.cached_session() as sess: + with self.cached_session(): sample_data = tf.zeros([1000, 2]) wav_encoder = tf.audio.encode_wav(sample_data, 16000) wav_data = self.evaluate(wav_encoder) diff --git a/tensorflow/examples/speech_commands/train.py b/tensorflow/examples/speech_commands/train.py index e917a51d837..bce5e521092 100644 --- a/tensorflow/examples/speech_commands/train.py +++ b/tensorflow/examples/speech_commands/train.py @@ -251,12 +251,16 @@ def main(_): dropout_rate: 0.5 }) train_writer.add_summary(train_summary, training_step) - tf.compat.v1.logging.info( + tf.compat.v1.logging.debug( 'Step #%d: rate %f, accuracy %.1f%%, cross entropy %f' % (training_step, learning_rate_value, train_accuracy * 100, cross_entropy_value)) is_last_step = (training_step == training_steps_max) if (training_step % FLAGS.eval_step_interval) == 0 or is_last_step: + tf.compat.v1.logging.info( + 'Step #%d: rate %f, accuracy %.1f%%, cross entropy %f' % + (training_step, learning_rate_value, train_accuracy * 100, + cross_entropy_value)) set_size = audio_processor.set_size('validation') total_accuracy = 0 total_conf_matrix = None diff --git a/tensorflow/examples/speech_commands/wav_to_features_test.py b/tensorflow/examples/speech_commands/wav_to_features_test.py index dfe6c657c2f..f7d617f14f8 100644 --- a/tensorflow/examples/speech_commands/wav_to_features_test.py +++ b/tensorflow/examples/speech_commands/wav_to_features_test.py @@ -30,7 +30,7 @@ from tensorflow.python.platform import test class WavToFeaturesTest(test.TestCase): def _getWavData(self): - with self.cached_session() as sess: + with 
self.cached_session(): sample_data = tf.zeros([32000, 2]) wav_encoder = tf.audio.encode_wav(sample_data, 16000) wav_data = self.evaluate(wav_encoder) @@ -63,7 +63,7 @@ class WavToFeaturesTest(test.TestCase): input_file_path, output_file_path) with open(output_file_path, "rb") as f: content = f.read() - self.assertTrue(b"const unsigned char g_input_data" in content) + self.assertIn(b"const unsigned char g_input_data", content) @test_util.run_deprecated_v1 def testWavToFeaturesMicro(self): diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index ecdce1e627b..449a95765a5 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11611,7 +11611,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -11868,7 +11868,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -11879,7 +11879,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -12085,7 +12085,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12096,7 +12096,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18937,7 +18937,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. 
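The ImageSummaryBadColor hunk continuing below only reflows the rendered default in the generated doc comment; the API itself is unchanged. As a reading aid, a hypothetical caller might look like the sketch that follows, assuming the standard tensorflow/go and tensorflow/go/op packages; the scope, tag, and placeholder names are ours, not part of the patch.

package main

import (
	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	tag := op.Const(s.SubScope("tag"), "training_images")
	images := op.Placeholder(s.SubScope("images"), tf.Float)
	// bad_color is itself a small RGBA uint8 tensor, mirroring the
	// {dtype:DT_UINT8 ...} default quoted in the comment above.
	badColor, err := tf.NewTensor([]uint8{255, 255, 0, 255})
	if err != nil {
		panic(err)
	}
	_ = op.ImageSummary(s, tag, images, op.ImageSummaryBadColor(*badColor))
}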
-// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20077,7 +20077,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21345,7 +21345,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22053,7 +22053,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22249,7 +22249,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22318,7 +22318,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22433,7 +22433,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22492,7 +22492,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22666,7 +22666,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22857,7 +22857,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25297,7 +25297,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25629,7 +25629,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25679,7 +25679,7 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25929,7 +25929,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
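All of these wrappers.go hunks touch only the generated doc comments for optional attributes; the functional-options pattern they document (each XxxDilations helper returns a closure that fills the op's optionalAttr map) is unchanged. A minimal sketch of how such options are consumed, again assuming the standard tensorflow/go and tensorflow/go/op packages, with placeholder inputs of our own invention:

package main

import (
	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	input := op.Placeholder(s.SubScope("input"), tf.Float)
	filter := op.Placeholder(s.SubScope("filter"), tf.Float)
	// Passing Conv2DDilations overrides the generated default of
	// {i:1 i:1 i:1 i:1} documented in the hunks above; omitting the
	// option keeps that default.
	_ = op.Conv2D(s, input, filter, []int64{1, 1, 1, 1}, "SAME",
		op.Conv2DDilations([]int64{1, 2, 2, 1}))
}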
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26559,7 +26559,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -27624,7 +27624,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45536,7 +45536,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value diff --git a/tensorflow/go/tensor.go b/tensorflow/go/tensor.go index 0ce080d8bd5..9bc643ae6d2 100644 --- a/tensorflow/go/tensor.go +++ b/tensorflow/go/tensor.go @@ -94,22 +94,9 @@ func NewTensor(value interface{}) (*Tensor, error) { raw := tensorData(t.c) buf := bytes.NewBuffer(raw[:0:len(raw)]) if dataType != String { - if isAllArray(val.Type()) { - // We have arrays all the way down, or just primitive types. We can - // just copy the memory in as it is all contiguous. - if err := copyPtr(buf, unpackEFace(value).data, int(val.Type().Size())); err != nil { - return nil, err - } - } else { - // When there are slices involved the memory for each leaf slice may - // not be contiguous with the others or in the order we might - // expect, so we need to work our way down to each slice of - // primitives and copy them individually - if err := encodeTensorWithSlices(buf, val, shape); err != nil { - return nil, err - } + if err := encodeTensor(buf, val, shape); err != nil { + return nil, err } - if uintptr(buf.Len()) != nbytes { return nil, bug("NewTensor incorrectly calculated the size of a tensor with type %v and shape %v as %v bytes instead of %v", dataType, shape, nbytes, buf.Len()) } @@ -125,43 +112,6 @@ func NewTensor(value interface{}) (*Tensor, error) { return t, nil } -// isAllArray returns true if type is a primitive type or an array of primitive -// types or an array of ... etc.. When this is true the data we want is -// contiguous in RAM. -func isAllArray(typ reflect.Type) bool { - switch typ.Kind() { - case reflect.Slice: - return false - case reflect.Array: - return isAllArray(typ.Elem()) - default: - // We know the type is slices/arrays of slices/arrays of primitive types. 
- return true - } -} - -// eface defines what an interface type actually is: a pointer to type -// information about the encapsulated type and a pointer to the encapsulated -// value. -type eface struct { - rtype unsafe.Pointer - data unsafe.Pointer -} - -// unpackEFace gives us an effient way to get us a pointer to the value carried -// in an interface. If you wrap a pointer type in an interface then the pointer -// is directly stored in the interface struct. If you wrap a value type in an -// interface then the compiler copies the value into a newly allocated piece of -// memory and stores a pointer to that memory in the interface. So we're -// guaranteed to get a pointer. Go reflection doesn't expose the pointer to -// value types straightforwardly as it doesn't want you to think you have a -// reference to the original value. But we just want a pointer to make it -// efficient to read the value, so cheating like this should be safe and -// reasonable. -func unpackEFace(obj interface{}) *eface { - return (*eface)(unsafe.Pointer(&obj)) -} - // ReadTensor constructs a Tensor with the provided type and shape from the // serialized tensor contents in r. // @@ -218,90 +168,23 @@ func (t *Tensor) Shape() []int64 { return t.shape } // Tensor(int64, 0): int64 // Tensor(float64, 3): [][][]float64 func (t *Tensor) Value() interface{} { - raw := tensorData(t.c) - shape := t.Shape() - dt := t.DataType() - if dt != String { - return decodeTensor(raw, shape, dt).Interface() - } - - typ := typeOf(dt, shape) + typ := typeOf(t.DataType(), t.Shape()) val := reflect.New(typ) - nflattened := numElements(shape) - d := stringDecoder{offsets: bytes.NewReader(raw[0 : 8*nflattened]), data: raw[8*nflattened:], status: newStatus()} - if err := d.decode(val, shape); err != nil { - panic(bug("unable to decode String tensor with shape %v - %v", shape, err)) + raw := tensorData(t.c) + if t.DataType() != String { + if err := decodeTensor(bytes.NewReader(raw), t.Shape(), typ, val); err != nil { + panic(bug("unable to decode Tensor of type %v and shape %v - %v", t.DataType(), t.Shape(), err)) + } + } else { + nflattened := numElements(t.Shape()) + d := stringDecoder{offsets: bytes.NewReader(raw[0 : 8*nflattened]), data: raw[8*nflattened:], status: newStatus()} + if err := d.decode(val, t.Shape()); err != nil { + panic(bug("unable to decode String tensor with shape %v - %v", t.Shape(), err)) + } } return reflect.Indirect(val).Interface() } -func decodeTensor(raw []byte, shape []int64, dt DataType) reflect.Value { - typ := typeForDataType(dt) - // Create a 1-dimensional slice of the base large enough for the data and - // copy the data in. - n := int(numElements(shape)) - l := n * int(typ.Size()) - typ = reflect.SliceOf(typ) - slice := reflect.MakeSlice(typ, n, n) - h := sliceHeader{ - Data: unsafe.Pointer(slice.Pointer()), - Len: l, - Cap: l, - } - baseBytes := *(*[]byte)(unsafe.Pointer(&h)) - copy(baseBytes, raw) - // Now we have the data in place in the base slice we can add the - // dimensions. We want to walk backwards through the shape. If the shape is - // length 1 or 0 then we're already done. - if len(shape) == 0 { - return slice.Index(0) - } - if len(shape) == 1 { - return slice - } - // We have a special case if the tensor has no data. Our backing slice is - // empty, but we still want to create slices following the shape. In this - // case only the final part of the shape will be 0 and we want to recalculate - // n at this point ignoring that 0. 
- // For example if our shape is 3 * 2 * 0 then n will be zero, but we still - // want 6 zero length slices to group as follows. - // {{} {}} {{} {}} {{} {}} - if n == 0 { - n = int(numElements(shape[:len(shape)-1])) - } - for i := len(shape) - 2; i >= 0; i-- { - underlyingSize := typ.Elem().Size() - typ = reflect.SliceOf(typ) - subsliceLen := int(shape[i+1]) - if subsliceLen != 0 { - n = n / subsliceLen - } - // Just using reflection it is difficult to avoid unnecessary - // allocations while setting up the sub-slices as the Slice function on - // a slice Value allocates. So we end up doing pointer arithmetic! - // Pointer() on a slice gives us access to the data backing the slice. - // We insert slice headers directly into this data. - data := slice.Pointer() - nextSlice := reflect.MakeSlice(typ, n, n) - nextData := nextSlice.Pointer() - const sliceSize = unsafe.Sizeof(sliceHeader{}) - for j := 0; j < n; j++ { - // This is equivalent to h := slice[j*subsliceLen: (j+1)*subsliceLen] - h := sliceHeader{ - Data: unsafe.Pointer(data + (uintptr(j*subsliceLen) * underlyingSize)), - Len: subsliceLen, - Cap: subsliceLen, - } - - // This is equivalent to nSlice[j] = h - *(*sliceHeader)(unsafe.Pointer(nextData + (uintptr(j) * sliceSize))) = h - } - - slice = nextSlice - } - return slice -} - // WriteContentsTo writes the serialized contents of t to w. // // Returns the number of bytes written. See ReadTensor for @@ -378,18 +261,18 @@ func shapeAndDataTypeOf(val reflect.Value) (shape []int64, dt DataType, err erro return shape, dt, fmt.Errorf("unsupported type %v", typ) } -func typeForDataType(dt DataType) reflect.Type { - for _, t := range types { - if dt == DataType(t.dataType) { - return t.typ - } - } - panic(bug("DataType %v is not supported (see https://www.tensorflow.org/code/tensorflow/core/framework/types.proto)", dt)) -} - // typeOf converts from a DataType and Shape to the equivalent Go type. func typeOf(dt DataType, shape []int64) reflect.Type { - ret := typeForDataType(dt) + var ret reflect.Type + for _, t := range types { + if dt == DataType(t.dataType) { + ret = t.typ + break + } + } + if ret == nil { + panic(bug("DataType %v is not supported (see https://www.tensorflow.org/code/tensorflow/core/framework/types.proto)", dt)) + } for range shape { ret = reflect.SliceOf(ret) } @@ -419,63 +302,92 @@ func byteSizeOfEncodedStrings(val interface{}) uintptr { return size } -// encodeTensorWithSlices writes v to the specified buffer using the format specified in +// encodeTensor writes v to the specified buffer using the format specified in // c_api.h. Use stringEncoder for String tensors. -func encodeTensorWithSlices(w *bytes.Buffer, v reflect.Value, shape []int64) error { - // If current dimension is a slice, verify that it has the expected size - // Go's type system makes that guarantee for arrays. 
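Note that both the encodeTensorWithSlices body being removed in the hunk that continues below and its restored encodeTensor replacement keep the same slice-length check, so a ragged nested slice is rejected at encode time rather than silently mis-encoded. A small sketch of that behavior; the error wording comes from the fmt.Errorf visible in the hunk:

package main

import (
	"fmt"

	tf "github.com/tensorflow/tensorflow/tensorflow/go"
)

func main() {
	// Rows of different lengths cannot describe a rectangular tensor; the
	// shape is inferred from the first row, so the second row fails the check.
	ragged := [][]float32{{1, 2, 3}, {4, 5}}
	if _, err := tf.NewTensor(ragged); err != nil {
		fmt.Println(err) // mismatched slice lengths: 2 and 3
	}
}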
- if v.Kind() == reflect.Slice { - expected := int(shape[0]) - if v.Len() != expected { - return fmt.Errorf("mismatched slice lengths: %d and %d", v.Len(), expected) +func encodeTensor(w *bytes.Buffer, v reflect.Value, shape []int64) error { + switch v.Kind() { + case reflect.Bool: + b := byte(0) + if v.Bool() { + b = 1 } - } else if v.Kind() != reflect.Array { - return fmt.Errorf("unsupported type %v", v.Type()) - } - - // Once we have just a single dimension we can just copy the data - if len(shape) == 1 && v.Len() > 0 { - elt := v.Index(0) - if !elt.CanAddr() { - panic("cannot take address") - } - ptr := unsafe.Pointer(elt.Addr().Pointer()) - return copyPtr(w, ptr, v.Len()*int(elt.Type().Size())) - } - - subShape := shape[1:] - for i := 0; i < v.Len(); i++ { - err := encodeTensorWithSlices(w, v.Index(i), subShape) - if err != nil { + if err := w.WriteByte(b); err != nil { + return err + } + case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128: + if err := binary.Write(w, nativeEndian, v.Interface()); err != nil { return err } - } + case reflect.Array, reflect.Slice: + // If current dimension is a slice, verify that it has the expected size + // Go's type system makes that guarantee for arrays. + if v.Kind() == reflect.Slice { + expected := int(shape[0]) + if v.Len() != expected { + return fmt.Errorf("mismatched slice lengths: %d and %d", v.Len(), expected) + } + } + + // Optimisation: if only one dimension is left we can use binary.Write() directly for this slice + if len(shape) == 1 && v.Len() > 0 { + switch v.Index(0).Kind() { + case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128: + return binary.Write(w, nativeEndian, v.Interface()) + } + } + + subShape := shape[1:] + for i := 0; i < v.Len(); i++ { + err := encodeTensor(w, v.Index(i), subShape) + if err != nil { + return err + } + } + + default: + return fmt.Errorf("unsupported type %v", v.Type()) + } return nil } -// sliceHeader is a safer version of reflect.SliceHeader. Using unsafe.Pointer -// for Data reduces potential issues with the GC. The reflect package uses a -// similar struct internally. -type sliceHeader struct { - Data unsafe.Pointer - Len int - Cap int -} +// decodeTensor decodes the Tensor from the buffer to ptr using the format +// specified in c_api.h. Use stringDecoder for String tensors. +func decodeTensor(r *bytes.Reader, shape []int64, typ reflect.Type, ptr reflect.Value) error { + switch typ.Kind() { + case reflect.Bool: + b, err := r.ReadByte() + if err != nil { + return err + } + ptr.Elem().SetBool(b == 1) + case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128: + if err := binary.Read(r, nativeEndian, ptr.Interface()); err != nil { + return err + } -// copyPtr copies the backing data for a slice or array directly into w. Note -// we don't need to worry about byte ordering because we want the natural byte -// order for the machine we're running on. 
-func copyPtr(w *bytes.Buffer, ptr unsafe.Pointer, l int) error { - h := sliceHeader{ - Data: ptr, - Len: l, - Cap: l, + case reflect.Slice: + val := reflect.Indirect(ptr) + val.Set(reflect.MakeSlice(typ, int(shape[0]), int(shape[0]))) + + // Optimization: if only one dimension is left we can use binary.Read() directly for this slice + if len(shape) == 1 && val.Len() > 0 { + switch val.Index(0).Kind() { + case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128: + return binary.Read(r, nativeEndian, val.Interface()) + } + } + + for i := 0; i < val.Len(); i++ { + if err := decodeTensor(r, shape[1:], typ.Elem(), val.Index(i).Addr()); err != nil { + return err + } + } + + default: + return fmt.Errorf("unsupported type %v", typ) } - // Convert our slice header into a []byte so we can call w.Write - b := *(*[]byte)(unsafe.Pointer(&h)) - _, err := w.Write(b) - return err + return nil } type stringEncoder struct { diff --git a/tensorflow/go/tensor_test.go b/tensorflow/go/tensor_test.go index 4d2df3a97dd..dc533cd3e1c 100644 --- a/tensorflow/go/tensor_test.go +++ b/tensorflow/go/tensor_test.go @@ -18,7 +18,6 @@ package tensorflow import ( "bytes" - "fmt" "io" "reflect" "testing" @@ -277,7 +276,6 @@ func TestReadTensorReadAll(t *testing.T) { } func benchmarkNewTensor(b *testing.B, v interface{}) { - b.ReportAllocs() for i := 0; i < b.N; i++ { if t, err := NewTensor(v); err != nil || t == nil { b.Fatalf("(%v, %v)", t, err) @@ -285,52 +283,32 @@ func benchmarkNewTensor(b *testing.B, v interface{}) { } } -func benchmarkValueTensor(b *testing.B, v interface{}) { - t, err := NewTensor(v) - if err != nil { - b.Fatalf("(%v, %v)", t, err) - } - b.ReportAllocs() - b.ResetTimer() +func BenchmarkNewTensor(b *testing.B) { + var ( + // Some sample sizes from the Inception image labeling model. + // Where input tensors correspond to a 224x224 RGB image + // flattened into a vector. + vector [224 * 224 * 3]int32 + ) + b.Run("[150528]", func(b *testing.B) { benchmarkNewTensor(b, vector) }) +} +func benchmarkDecodeTensor(b *testing.B, t *Tensor) { for i := 0; i < b.N; i++ { _ = t.Value() } } -func BenchmarkTensor(b *testing.B) { - // Some sample sizes from the Inception image labeling model. - // Where input tensors correspond to a 224x224 RGB image - // flattened into a vector. - var vector [224 * 224 * 3]int32 - var arrays [100][100][100]int32 - - l3 := make([][][]float32, 100) - l2 := make([][]float32, 100*100) - l1 := make([]float32, 100*100*100) - for i := range l2 { - l2[i] = l1[i*100 : (i+1)*100] +func BenchmarkDecodeTensor(b *testing.B) { + var ( + // Some sample sizes from the Inception image labeling model. + // Where input tensors correspond to a 224x224 RGB image + // flattened into a vector. 
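The benchmark hunks around this point time exactly the two paths changed in this file: NewTensor (encode) and Value (decode). For orientation, a round-trip through that API; a minimal sketch assuming the tensorflow/go package as above:

package main

import (
	"fmt"

	tf "github.com/tensorflow/tensorflow/tensorflow/go"
)

func main() {
	// Encode nested slices into a Tensor (the path BenchmarkNewTensor times)...
	t, err := tf.NewTensor([][]float32{{1, 2, 3}, {4, 5, 6}})
	if err != nil {
		panic(err)
	}
	// ...and decode it back into Go values (the path BenchmarkDecodeTensor times).
	fmt.Println(t.Shape())               // [2 3]
	fmt.Println(t.Value().([][]float32)) // [[1 2 3] [4 5 6]]
}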
+ vector [224 * 224 * 3]int32 + ) + t, err := NewTensor(vector) + if err != nil { + b.Fatalf("(%v, %v)", t, err) } - for i := range l3 { - l3[i] = l2[i*100 : (i+1)*100] - } - - tests := []interface{}{ - vector, - arrays, - l1, - l2, - l3, - } - b.Run("New", func(b *testing.B) { - for _, test := range tests { - b.Run(fmt.Sprintf("%T", test), func(b *testing.B) { benchmarkNewTensor(b, test) }) - } - }) - b.Run("Value", func(b *testing.B) { - for _, test := range tests { - b.Run(fmt.Sprintf("%T", test), func(b *testing.B) { benchmarkValueTensor(b, test) }) - } - }) - + b.Run("[150528]", func(b *testing.B) { benchmarkDecodeTensor(b, t) }) } diff --git a/tensorflow/lite/delegates/flex/BUILD b/tensorflow/lite/delegates/flex/BUILD index 6a0a5a43b20..60a03cbe741 100644 --- a/tensorflow/lite/delegates/flex/BUILD +++ b/tensorflow/lite/delegates/flex/BUILD @@ -1,4 +1,4 @@ -load("//tensorflow:tensorflow.bzl", "tf_cc_test") +load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_opts_nortti_if_lite_protos") # # This is a TF Lite delegate that is powered by TensorFlow's Eager. @@ -15,6 +15,7 @@ cc_library( name = "buffer_map", srcs = ["buffer_map.cc"], hdrs = ["buffer_map.h"], + copts = tf_opts_nortti_if_lite_protos(), deps = [ ":util", "//tensorflow/lite/c:common", diff --git a/tensorflow/lite/delegates/gpu/README.md b/tensorflow/lite/delegates/gpu/README.md index 42d8e4b2caa..ee21ba27b95 100644 --- a/tensorflow/lite/delegates/gpu/README.md +++ b/tensorflow/lite/delegates/gpu/README.md @@ -30,6 +30,7 @@ TFLite on GPU supports the following ops in 16-bit and 32-bit float precision: * `CONCATENATION v1` * `CONV_2D v1` * `DEPTHWISE_CONV_2D v1-2` +* `EXP v1` * `FULLY_CONNECTED v1` * `LOGISTIC v1` * `LSTM v2 (Basic LSTM only)` diff --git a/tensorflow/lite/delegates/gpu/cl/BUILD b/tensorflow/lite/delegates/gpu/cl/BUILD index d09e75e052d..2aeff13b3be 100644 --- a/tensorflow/lite/delegates/gpu/cl/BUILD +++ b/tensorflow/lite/delegates/gpu/cl/BUILD @@ -301,6 +301,7 @@ cc_library( ":model_hints", ":opencl_wrapper", ":precision", + ":storage_type_util", ":tensor_type", "//tensorflow/lite/delegates/gpu/cl/kernels:gpu_operation", "//tensorflow/lite/delegates/gpu/cl/selectors:operation_selector", @@ -387,6 +388,19 @@ cc_library( ], ) +cc_library( + name = "storage_type_util", + srcs = ["storage_type_util.cc"], + hdrs = ["storage_type_util.h"], + deps = [ + ":cl_context", + ":cl_device", + ":tensor_type", + "//tensorflow/lite/delegates/gpu/common:data_type", + "//tensorflow/lite/delegates/gpu/common:shape", + ], +) + cc_library( name = "tensor", srcs = ["tensor.cc"], diff --git a/tensorflow/lite/delegates/gpu/cl/inference_context.cc b/tensorflow/lite/delegates/gpu/cl/inference_context.cc index a2a66cae0c9..6b0511fb267 100644 --- a/tensorflow/lite/delegates/gpu/cl/inference_context.cc +++ b/tensorflow/lite/delegates/gpu/cl/inference_context.cc @@ -30,6 +30,7 @@ limitations under the License. 
#include "tensorflow/lite/delegates/gpu/cl/model_hints.h" #include "tensorflow/lite/delegates/gpu/cl/precision.h" #include "tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.h" +#include "tensorflow/lite/delegates/gpu/cl/storage_type_util.h" #include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" #include "tensorflow/lite/delegates/gpu/common/data_type.h" #include "tensorflow/lite/delegates/gpu/common/memory_management.h" @@ -109,64 +110,6 @@ void AddUsage(ValueId id, int task_index, } } -TensorStorageType SelectBestStorageType(const CLContext& context, - const CLDevice& device, - const BHWC& shape, - const TensorStorageType& desired, - const DataType& data_type, - const Layout& layout) { - if (CanCreateTensorWithShape(context, device, shape, - TensorDescriptor{data_type, desired, layout})) { - return desired; - } - auto GetBestTypeAfterTextureArray = [&]() { - if (device.SupportsImageBuffer() && - CanCreateTensorWithShape( - context, device, shape, - TensorDescriptor{data_type, TensorStorageType::IMAGE_BUFFER, - layout})) { - return TensorStorageType::IMAGE_BUFFER; - } else { - return TensorStorageType::BUFFER; - } - }; - auto GetBestTypeAfterTexture2D = [&]() { - if (device.SupportsTextureArray() && - CanCreateTensorWithShape( - context, device, shape, - TensorDescriptor{data_type, TensorStorageType::TEXTURE_ARRAY, - layout})) { - return TensorStorageType::TEXTURE_ARRAY; - } else { - return GetBestTypeAfterTextureArray(); - } - }; - auto GetBestTypeAfterTexture3D = [&]() { - if (CanCreateTensorWithShape( - context, device, shape, - TensorDescriptor{data_type, TensorStorageType::TEXTURE_2D, - layout})) { - return TensorStorageType::TEXTURE_2D; - } else { - return GetBestTypeAfterTexture2D(); - } - }; - switch (desired) { - case TensorStorageType::TEXTURE_2D: - case TensorStorageType::SINGLE_TEXTURE_2D: - return GetBestTypeAfterTexture2D(); - case TensorStorageType::TEXTURE_ARRAY: - return GetBestTypeAfterTextureArray(); - case TensorStorageType::TEXTURE_3D: - return GetBestTypeAfterTexture3D(); - case TensorStorageType::IMAGE_BUFFER: - case TensorStorageType::BUFFER: - return TensorStorageType::BUFFER; - default: - return TensorStorageType::BUFFER; - } -} - // returns true if actual memory for this storage type will be allocated with // clCreateBuffer. 
bool IsBufferBased(const TensorStorageType& type) { diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/BUILD b/tensorflow/lite/delegates/gpu/cl/kernels/BUILD index 4076213cd23..920bf9fd028 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/BUILD +++ b/tensorflow/lite/delegates/gpu/cl/kernels/BUILD @@ -128,47 +128,6 @@ cc_library( ], ) -cc_library( - name = "conv_buffer", - srcs = ["conv_buffer.cc"], - hdrs = ["conv_buffer.h"], - deps = [ - ":gpu_operation", - ":util", - ":work_group_picking", - "//tensorflow/lite/delegates/gpu/cl:buffer", - "//tensorflow/lite/delegates/gpu/cl:linear_storage", - "//tensorflow/lite/delegates/gpu/cl:precision", - "//tensorflow/lite/delegates/gpu/cl:tensor", - "//tensorflow/lite/delegates/gpu/cl:tensor_type", - "//tensorflow/lite/delegates/gpu/cl:util", - "//tensorflow/lite/delegates/gpu/common:data_type", - "//tensorflow/lite/delegates/gpu/common:operations", - "//tensorflow/lite/delegates/gpu/common:shape", - "//tensorflow/lite/delegates/gpu/common:status", - "//tensorflow/lite/delegates/gpu/common:tensor", - "//tensorflow/lite/delegates/gpu/common:types", - ], -) - -cc_test( - name = "conv_buffer_test", - srcs = ["conv_buffer_test.cc"], - linkstatic = True, - tags = tf_gpu_tests_tags() + [ - "linux", - "local", - ], - deps = [ - ":cl_test", - ":conv_buffer", - "//tensorflow/lite/delegates/gpu/cl:tensor", - "//tensorflow/lite/delegates/gpu/common:operations", - "//tensorflow/lite/delegates/gpu/common:status", - "@com_google_googletest//:gtest_main", - ], -) - cc_library( name = "conv_buffer_1x1", srcs = ["conv_buffer_1x1.cc"], @@ -1193,6 +1152,39 @@ cc_test( ], ) +cc_library( + name = "space_to_depth", + srcs = ["space_to_depth.cc"], + hdrs = ["space_to_depth.h"], + deps = [ + ":gpu_operation", + ":util", + ":work_group_picking", + "//tensorflow/lite/delegates/gpu/cl:cl_kernel", + "//tensorflow/lite/delegates/gpu/common:operations", + "//tensorflow/lite/delegates/gpu/common:status", + "//tensorflow/lite/delegates/gpu/common:types", + ], +) + +cc_test( + name = "space_to_depth_test", + srcs = ["space_to_depth_test.cc"], + linkstatic = True, + tags = tf_gpu_tests_tags() + [ + "linux", + "local", + ], + deps = [ + ":cl_test", + ":space_to_depth", + "//tensorflow/lite/delegates/gpu/cl:tensor", + "//tensorflow/lite/delegates/gpu/common:operations", + "//tensorflow/lite/delegates/gpu/common:status", + "@com_google_googletest//:gtest_main", + ], +) + cc_library( name = "strided_slice", srcs = ["strided_slice.cc"], @@ -1325,13 +1317,16 @@ cc_library( ":gpu_operation", ":util", ":work_group_picking", + "//tensorflow/lite/delegates/gpu/cl:cl_device", "//tensorflow/lite/delegates/gpu/cl:cl_kernel", "//tensorflow/lite/delegates/gpu/cl:linear_storage", "//tensorflow/lite/delegates/gpu/cl:precision", "//tensorflow/lite/delegates/gpu/cl:tensor", + "//tensorflow/lite/delegates/gpu/common:data_type", "//tensorflow/lite/delegates/gpu/common:operations", + "//tensorflow/lite/delegates/gpu/common:shape", "//tensorflow/lite/delegates/gpu/common:status", - "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) @@ -1375,7 +1370,6 @@ test_suite( "add_test", "concat_test", "conv_buffer_1x1_test", - "conv_buffer_test", "conv_constants_test", "conv_powervr_test", "conv_texture_test", diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer.cc b/tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer.cc deleted file mode 100644 index e2add78167f..00000000000 --- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer.cc +++ /dev/null @@ -1,291 
+0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer.h" - -#include -#include - -#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h" -#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h" -#include "tensorflow/lite/delegates/gpu/cl/precision.h" -#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" - -namespace tflite { -namespace gpu { -namespace cl { -namespace { - -std::string GenerateConvBuffer( - const OperationDef& op_def, bool stride_correction, int x_elements, - int y_elements, - const std::vector& linked_operations) { - std::string c = GetCommonDefines(op_def.precision); - TensorCodeGenerator src_tensor( - "src_data", WHSPoint{"src_size.x", "src_size.y", "src_size.z"}, - op_def.src_tensors[0]); - TensorCodeGenerator dst_tensor( - "dst_data", WHSPoint{"dst_size.x", "dst_size.y", "dst_size.z"}, - op_def.dst_tensors[0]); - - switch (op_def.precision) { - case CalculationsPrecision::F32: - case CalculationsPrecision::F16: - c += "#define CONV(R, S) \\\n"; - c += "R += S.x * f0.s0123; \\\n"; - c += "R += S.y * f0.s4567; \\\n"; - c += "R += S.z * f0.s89ab; \\\n"; - c += "R += S.w * f0.scdef; \n"; - break; - case CalculationsPrecision::F32_F16: - c += "#define CONV(R, S) \\\n"; - c += "R += convert_float4(S.x * f0.s0123 + S.y * f0.s4567 + S.z * " - "f0.s89ab + S.w * f0.scdef);\n"; - break; - } - - switch (op_def.precision) { - case CalculationsPrecision::F32: - c += "#define FLT16 float16\n"; - break; - case CalculationsPrecision::F32_F16: - case CalculationsPrecision::F16: - c += "#define FLT16 half16\n"; - break; - } - - c += "__kernel void main_function(\n"; - c += src_tensor.GetDeclaration(AccessType::READ) + ",\n"; - c += " __global FLT16* filters_buffer, \n"; - c += " __global FLT4* biases \n"; - c += GetArgsDeclaration(linked_operations); - c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n"; - c += " int4 src_size, \n"; - c += " int4 dst_size, \n"; - c += " int2 kernel_size, \n"; - c += " int2 dilation, \n"; - c += " int2 stride, \n"; - c += " int2 padding \n"; - c += ") {\n"; - c += " int X = get_global_id(0) * " + std::to_string(x_elements) + ";\n"; - c += " int Y = get_global_id(1) * " + std::to_string(y_elements) + ";\n"; - c += " int Z = get_global_id(2);\n"; - c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;\n"; - c += " __global FLT16* temp = filters_buffer + Z * src_size.z * " - "kernel_size.x * kernel_size.y;\n"; - c += " ACCUM_FLT4 bias_val = TO_ACCUM_TYPE(biases[Z]);\n"; - for (int i = 0; i < x_elements * y_elements; ++i) { - c += " ACCUM_FLT4 r" + std::to_string(i) + " = bias_val;\n"; - } - for (int x = 0; x < x_elements; ++x) { - std::string x_s = std::to_string(x); - if (stride_correction) { - c += " int xc" + x_s + " = " + - GetXStrideCorrected("X + " + x_s, "src_size.w", "stride.x", - "padding.x") + - ";\n"; - } else 
{ - c += " int xc" + x_s + " = (X + " + x_s + ") * stride.x + padding.x;\n"; - } - } - for (int y = 0; y < y_elements; ++y) { - std::string y_s = std::to_string(y); - c += " int yc" + y_s + " = (Y + " + y_s + ") * stride.y + padding.y;\n"; - } - c += " for (int y = 0; y < kernel_size.y; ++y) {\n"; - for (int y = 0; y < y_elements; ++y) { - std::string y_s = std::to_string(y); - c += " int c" + y_s + "y = y * dilation.y + yc" + y_s + ";\n"; - c += " bool y" + y_s + "_in = c" + y_s + "y >= 0 && c" + y_s + - "y < src_size.y;\n"; - c += " c" + y_s + "y = clamp(c" + y_s + "y, 0, src_size.y - 1);\n"; - } - c += " for (int x = 0; x < kernel_size.x; ++x) {\n"; - for (int x = 0; x < x_elements; ++x) { - std::string x_s = std::to_string(x); - c += " int c" + x_s + "x = x * dilation.x + xc" + x_s + ";\n"; - c += " bool x" + x_s + "_in = c" + x_s + "x >= 0 && c" + x_s + - "x < src_size.x;\n"; - c += " c" + x_s + "x = clamp(c" + x_s + "x, 0, src_size.x - 1);\n"; - } - for (int x = 0; x < x_elements; ++x) { - std::string x_s = std::to_string(x); - for (int y = 0; y < y_elements; ++y) { - std::string y_s = std::to_string(y); - std::string i_s = std::to_string(y * x_elements + x); - c += " int src_addr_" + i_s + " = c" + y_s + "y * src_size.x + c" + x_s + - "x;\n"; - } - } - c += " for (int s = 0; s < src_size.z; ++s) {\n"; - for (int x = 0; x < x_elements; ++x) { - std::string x_s = std::to_string(x); - for (int y = 0; y < y_elements; ++y) { - std::string y_s = std::to_string(y); - std::string i_s = std::to_string(y * x_elements + x); - c += " FLT4 s" + i_s + " = src_data[src_addr_" + i_s + "] * (FLT)(y" + - y_s + "_in && x" + x_s + "_in);\n"; - } - } - c += " FLT16 f0 = temp[0];\n"; - for (int i = 0; i < x_elements * y_elements; ++i) { - std::string i_s = std::to_string(i); - c += " CONV(r" + i_s + ", s" + i_s + ");\n"; - } - for (int i = 0; i < x_elements * y_elements; ++i) { - std::string i_s = std::to_string(i); - c += " src_addr_" + i_s + " += src_size.x * src_size.y;\n"; - } - c += " temp += 1;\n"; - c += " }\n"; // src_size.z - SRC_DEPTH - c += " }\n"; // kernel_size.x - c += " }\n"; // kernel_size.y - - for (int x = 0; x < x_elements; ++x) { - std::string x_s = std::to_string(x); - for (int y = 0; y < y_elements; ++y) { - std::string y_s = std::to_string(y); - std::string i_s = std::to_string(y * x_elements + x); - c += " if (X + " + x_s + " < dst_size.x && Y + " + y_s + - " < dst_size.y) {\n"; - c += " FLT4 res = TO_FLT4(r" + i_s + ");\n"; - const LinkingContext context{"res", "X + " + x_s, "Y + " + y_s, "Z"}; - c += PostProcess(linked_operations, context); - c += " " + dst_tensor.WriteWHS("res", "X + " + x_s, "Y + " + y_s, "Z") + - "\n"; - c += " }\n"; - } - } - c += "}\n"; - return c; -} -} // namespace - -ConvBuffer::ConvBuffer(const OperationDef& definition, - const Convolution2DAttributes& attr, int x_elements, - int y_elements) - : GPUOperation(definition), - kernel_size_(attr.weights.shape.w, attr.weights.shape.h), - stride_(attr.strides.w, attr.strides.h), - padding_(-attr.padding.prepended.w, -attr.padding.prepended.h), - dilation_(attr.dilations.w, attr.dilations.h), - x_elements_(x_elements), - y_elements_(y_elements), - work_group_size_(4, 4, 4) {} - -ConvBuffer::ConvBuffer(ConvBuffer&& operation) - : GPUOperation(std::move(operation)), - weights_(std::move(operation.weights_)), - biases_(std::move(operation.biases_)), - kernel_size_(operation.kernel_size_), - stride_(operation.stride_), - padding_(operation.padding_), - dilation_(operation.dilation_), - 
x_elements_(operation.x_elements_), - y_elements_(operation.y_elements_), - kernel_(std::move(operation.kernel_)), - work_group_size_(operation.work_group_size_) {} - -ConvBuffer& ConvBuffer::operator=(ConvBuffer&& operation) { - if (this != &operation) { - weights_ = std::move(operation.weights_); - biases_ = std::move(operation.biases_); - std::swap(kernel_size_, operation.kernel_size_); - std::swap(stride_, operation.stride_); - std::swap(padding_, operation.padding_); - std::swap(dilation_, operation.dilation_); - std::swap(x_elements_, operation.x_elements_); - std::swap(y_elements_, operation.y_elements_); - kernel_ = std::move(operation.kernel_); - std::swap(work_group_size_, operation.work_group_size_); - GPUOperation::operator=(std::move(operation)); - } - return *this; -} - -Status ConvBuffer::Compile(const CreationContext& creation_context) { - const bool stride_correction = - definition_.IsBatchSupported() && stride_.x != 1; - const std::string code = - GenerateConvBuffer(definition_, stride_correction, x_elements_, - y_elements_, linked_operations_); - return creation_context.cache->GetOrCreateCLKernel( - code, "main_function", *creation_context.context, - *creation_context.device, &kernel_); -} - -Status ConvBuffer::BindArguments() { - kernel_.ResetBindingCounter(); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr())); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_.GetMemoryPtr())); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(biases_.GetMemoryPtr())); - RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_)); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWBatchedHSB())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWBatchedHSB())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(kernel_size_)); - RETURN_IF_ERROR( - kernel_.SetBytesAuto(int2(dilation_.x * src_[0]->Batch(), dilation_.y))); - RETURN_IF_ERROR(kernel_.SetBytesAuto(stride_)); - RETURN_IF_ERROR( - kernel_.SetBytesAuto(int2(padding_.x * src_[0]->Batch(), padding_.y))); - return OkStatus(); -} - -int3 ConvBuffer::GetGridSize() const { - const int grid_x = - IntegralDivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(), x_elements_); - const int grid_y = IntegralDivideRoundUp(dst_[0]->Height(), y_elements_); - const int grid_z = dst_[0]->Slices(); - return int3(grid_x, grid_y, grid_z); -} - -Status ConvBuffer::Tune(const TuningParameters& params) { - RETURN_IF_ERROR(BindArguments()); - return GetBestWorkGroupConv(params, kernel_, GetGridSize(), - &work_group_size_); -} - -Status ConvBuffer::AddToQueue(CLCommandQueue* queue) { - RETURN_IF_ERROR(BindArguments()); - return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_); -} - -Status CreateConvBuffer(const CreationContext& creation_context, - const OperationDef& definition, - const Convolution2DAttributes& attr, - ConvBuffer* result) { - int x_elements = 2; - int y_elements = 1; - if (definition.precision != CalculationsPrecision::F16) { - x_elements = 1; - y_elements = 1; - } - *result = ConvBuffer(definition, attr, x_elements, y_elements); - RETURN_IF_ERROR( - result->UploadWeights(attr.weights, creation_context.context)); - LinearStorageCreateInfo create_info; - create_info.storage_type = LinearStorageType::BUFFER; - create_info.data_type = definition.GetDataType(); - create_info.aligned_size = attr.weights.shape.o; - RETURN_IF_ERROR(CreateLinearStorage( - create_info, attr.bias, creation_context.context, &result->biases_)); - - return OkStatus(); -} - -} // 
namespace cl -} // namespace gpu -} // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer.h b/tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer.h deleted file mode 100644 index 16197e13fd4..00000000000 --- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer.h +++ /dev/null @@ -1,118 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_BUFFER_H_ -#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_BUFFER_H_ - -#include - -#include "tensorflow/lite/delegates/gpu/cl/buffer.h" -#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h" -#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h" -#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h" -#include "tensorflow/lite/delegates/gpu/cl/tensor.h" -#include "tensorflow/lite/delegates/gpu/cl/util.h" -#include "tensorflow/lite/delegates/gpu/common/data_type.h" -#include "tensorflow/lite/delegates/gpu/common/operations.h" -#include "tensorflow/lite/delegates/gpu/common/shape.h" -#include "tensorflow/lite/delegates/gpu/common/status.h" -#include "tensorflow/lite/delegates/gpu/common/tensor.h" -#include "tensorflow/lite/delegates/gpu/common/types.h" - -namespace tflite { -namespace gpu { -namespace cl { - -class ConvBuffer : public GPUOperation { - public: - ConvBuffer() = default; - Status AddToQueue(CLCommandQueue* queue) override; - Status Tune(const TuningParameters& params) override; - - Status Compile(const CreationContext& creation_context) override; - - // Move only - ConvBuffer(ConvBuffer&& operation); - ConvBuffer& operator=(ConvBuffer&& operation); - ConvBuffer(const ConvBuffer&) = delete; - ConvBuffer& operator=(const ConvBuffer&) = delete; - - private: - friend Status CreateConvBuffer(const CreationContext& creation_context, - const OperationDef& definition, - const Convolution2DAttributes& attr, - ConvBuffer* result); - ConvBuffer(const OperationDef& definition, - const Convolution2DAttributes& attr, int x_elements, - int y_elements); - template - Status UploadWeights(const ::tflite::gpu::Tensor& weights, - CLContext* context); - - Status BindArguments(); - int3 GetGridSize() const; - - Buffer weights_; - LinearStorage biases_; - - int2 kernel_size_; - int2 stride_; - int2 padding_; - int2 dilation_; - int x_elements_; - int y_elements_; - - CLKernel kernel_; - int3 work_group_size_; -}; - -template -Status ConvBuffer::UploadWeights(const ::tflite::gpu::Tensor& weights, - CLContext* context) { - const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4); - const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4); - - const int float4_size = definition_.precision == CalculationsPrecision::F32 - ? 
sizeof(float4) - : sizeof(half4); - - const int elements_count = - weights.shape.h * weights.shape.w * src_depth * dst_depth * 4; - - if (definition_.GetDataType() == DataType::FLOAT32) { - std::vector gpu_data(elements_count); - RearrangeWeightsToOHWIOGroupI4O4(weights, /*out_group_size*/ 1, - absl::MakeSpan(gpu_data)); - return CreateReadOnlyBuffer(float4_size * elements_count, gpu_data.data(), - context, &weights_); - } else { - std::vector gpu_data(elements_count); - RearrangeWeightsToOHWIOGroupI4O4(weights, /*out_group_size*/ 1, - absl::MakeSpan(gpu_data)); - return CreateReadOnlyBuffer(float4_size * elements_count, gpu_data.data(), - context, &weights_); - } -} - -Status CreateConvBuffer(const CreationContext& creation_context, - const OperationDef& definition, - const Convolution2DAttributes& attr, - ConvBuffer* result); - -} // namespace cl -} // namespace gpu -} // namespace tflite - -#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_BUFFER_H_ diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_1x1.cc b/tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_1x1.cc index 90fcf9fa338..0d7f1e7ed26 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_1x1.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_1x1.cc @@ -86,7 +86,7 @@ std::string GetShiftFromElementSize(int element_size) { std::string GenerateConvBuffer1x1( const OperationDef& op_def, int x_elements, int y_elements, - int element_size, + int element_size, bool different_weights_for_height, const std::vector& linked_operations) { std::string c = GetCommonDefines(op_def.precision); TensorCodeGenerator dst_tensor( @@ -119,7 +119,12 @@ std::string GenerateConvBuffer1x1( c += " int Y = get_global_id(1) * " + std::to_string(y_elements) + ";\n"; c += " int Z = get_global_id(2);\n"; c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;\n"; - c += " __global FLT16* temp = filters_buffer + Z * src_size.z;\n"; + if (different_weights_for_height) { + c += " __global FLT16* temp = filters_buffer + (Z * src_size.y + Y) * " + "src_size.z;\n"; + } else { + c += " __global FLT16* temp = filters_buffer + Z * src_size.z;\n"; + } c += " ACCUM_FLT4 bias_val = TO_ACCUM_TYPE(biases[Z]);\n"; for (int i = 0; i < x_elements * element_size * y_elements; ++i) { c += " ACCUM_FLT4 r" + std::to_string(i) + " = bias_val;\n"; @@ -192,14 +197,15 @@ int GetGridWidth(int width) { } // namespace -ConvBuffer1x1::ConvBuffer1x1(const OperationDef& definition, - int flt4_x_count, int flt4_y_count, - int flt8_x_count, int flt8_y_count) +ConvBuffer1x1::ConvBuffer1x1(const OperationDef& definition, int flt4_x_count, + int flt4_y_count, int flt8_x_count, + int flt8_y_count) : GPUOperation(definition), flt4_x_count_(flt4_x_count), flt4_y_count_(flt4_y_count), flt8_x_count_(flt8_x_count), flt8_y_count_(flt8_y_count), + different_weights_for_height_(false), work_group_size_(2, 4, 1) {} ConvBuffer1x1::ConvBuffer1x1(ConvBuffer1x1&& operation) @@ -212,6 +218,7 @@ ConvBuffer1x1::ConvBuffer1x1(ConvBuffer1x1&& operation) kernel_flt8_(std::move(operation.kernel_flt8_)), flt8_x_count_(operation.flt8_x_count_), flt8_y_count_(operation.flt8_y_count_), + different_weights_for_height_(operation.different_weights_for_height_), work_group_size_(operation.work_group_size_) {} ConvBuffer1x1& ConvBuffer1x1::operator=(ConvBuffer1x1&& operation) { @@ -224,6 +231,8 @@ ConvBuffer1x1& ConvBuffer1x1::operator=(ConvBuffer1x1&& operation) { kernel_flt8_ = std::move(operation.kernel_flt8_); std::swap(flt8_x_count_, 
operation.flt8_x_count_); std::swap(flt8_y_count_, operation.flt8_y_count_); + std::swap(different_weights_for_height_, + operation.different_weights_for_height_); std::swap(work_group_size_, operation.work_group_size_); GPUOperation::operator=(std::move(operation)); } @@ -231,13 +240,15 @@ ConvBuffer1x1& ConvBuffer1x1::operator=(ConvBuffer1x1&& operation) { } Status ConvBuffer1x1::Compile(const CreationContext& creation_context) { - std::string code_flt4 = GenerateConvBuffer1x1( - definition_, flt4_x_count_, flt4_y_count_, 1, linked_operations_); + std::string code_flt4 = + GenerateConvBuffer1x1(definition_, flt4_x_count_, flt4_y_count_, 1, + different_weights_for_height_, linked_operations_); RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel( code_flt4, "main_function", *creation_context.context, *creation_context.device, &kernel_flt4_)); - std::string code_flt8 = GenerateConvBuffer1x1( - definition_, flt8_x_count_, flt8_y_count_, 2, linked_operations_); + std::string code_flt8 = + GenerateConvBuffer1x1(definition_, flt8_x_count_, flt8_y_count_, 2, + different_weights_for_height_, linked_operations_); RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel( code_flt8, "main_function", *creation_context.context, *creation_context.device, &kernel_flt8_)); @@ -253,7 +264,7 @@ CLKernel* ConvBuffer1x1::GetKernel(int width) { } Status ConvBuffer1x1::BindArguments() { - CLKernel* kernel = GetKernel(src_[0]->Width()); + CLKernel* kernel = GetKernel(dst_[0]->Width()); kernel->ResetBindingCounter(); RETURN_IF_ERROR(kernel->SetMemoryAuto(src_[0]->GetMemoryPtr())); RETURN_IF_ERROR(kernel->SetMemoryAuto(weights_.GetMemoryPtr())); @@ -297,7 +308,7 @@ bool IsConvBuffer1x1Supported(const OperationDef& definition, auto src_storage_type = definition.src_tensors[0].storage_type; return src_storage_type == TensorStorageType::BUFFER && attr.weights.shape.w == 1 && attr.weights.shape.h == 1 && - attr.dilations.w == 1 && attr.dilations.w == 1 && + attr.dilations.w == 1 && attr.dilations.h == 1 && attr.strides.w == 1 && attr.strides.h == 1 && attr.padding.prepended.w == 0 && attr.padding.prepended.h == 0 && attr.padding.appended.w == 0 && attr.padding.appended.h == 0; @@ -346,6 +357,17 @@ Status CreateConvBuffer1x1(const CreationContext& creation_context, return result->UploadData(attr.weights, attr.bias, creation_context.context); } +Status CreateConvBuffer1x1Wino4x4To6x6(const CreationContext& creation_context, + const OperationDef& definition, + const Convolution2DAttributes& attr, + ConvBuffer1x1* result) { + *result = ConvBuffer1x1(definition, 4 /*flt4_x_count*/, 1 /*flt4_y_count*/, + 2 /*flt8_x_count*/, 1 /*flt8_y_count*/); + result->different_weights_for_height_ = true; + return result->UploadDataForWinograd4x4To6x6( + attr.weights, *creation_context.device, creation_context.context); +} + } // namespace cl } // namespace gpu } // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_1x1.h b/tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_1x1.h index 7f10bf802ab..f90439e082d 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_1x1.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_1x1.h @@ -61,12 +61,20 @@ class ConvBuffer1x1 : public GPUOperation { const OperationDef& definition, const FullyConnectedAttributes& attr, ConvBuffer1x1* result); + friend Status CreateConvBuffer1x1Wino4x4To6x6( + const CreationContext& creation_context, const OperationDef& definition, + const Convolution2DAttributes& attr, ConvBuffer1x1* result); template 
<DataType T>
  Status UploadData(const ::tflite::gpu::Tensor<OHWI, T>& weights,
                    const ::tflite::gpu::Tensor<Linear, T>& biases,
                    CLContext* context);
   template <DataType T>
+  Status UploadDataForWinograd4x4To6x6(
+      const ::tflite::gpu::Tensor<OHWI, T>& weights, const CLDevice& device,
+      CLContext* context);
+
+  template <DataType T>
   Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
                        CLContext* context);
@@ -86,6 +94,11 @@ class ConvBuffer1x1 : public GPUOperation {
   int flt8_x_count_;
   int flt8_y_count_;
 
+  // By default, 2D convolution uses the same weights for the W and H
+  // dimensions, but in some cases (e.g. the Winograd transform) we need
+  // separate weights per H position; the convolution kernel requires only
+  // very small modifications to support this.
+  bool different_weights_for_height_;
+
   int3 work_group_size_;
 };
 
@@ -102,6 +115,24 @@ Status ConvBuffer1x1::UploadData(const ::tflite::gpu::Tensor<OHWI, T>& weights,
   return OkStatus();
 }
 
+template <DataType T>
+Status ConvBuffer1x1::UploadDataForWinograd4x4To6x6(
+    const ::tflite::gpu::Tensor<OHWI, T>& weights, const CLDevice& device,
+    CLContext* context) {
+  ::tflite::gpu::Tensor<OHWI, T> wino_weights;
+  RearrangeWeightsToWinograd4x4To6x6Weights(weights, &wino_weights);
+  RETURN_IF_ERROR(UploadWeights(wino_weights, context));
+
+  LinearStorageCreateInfo create_info;
+  create_info.storage_type = LinearStorageType::BUFFER;
+  create_info.data_type = definition_.GetDataType();
+  create_info.aligned_size = weights.shape.o;
+  ::tflite::gpu::Tensor<Linear, DataType::FLOAT32> bias;
+  bias.shape = Linear(weights.shape.o);
+  bias.data.resize(weights.shape.o, 0.0f);
+  return CreateLinearStorage(create_info, bias, context, &biases_);
+}
+
 template <DataType T>
 Status ConvBuffer1x1::UploadWeights(
     const ::tflite::gpu::Tensor<OHWI, T>& weights, CLContext* context) {
@@ -143,6 +174,11 @@ Status CreateConvBuffer1x1(const CreationContext& creation_context,
                            const FullyConnectedAttributes& attr,
                            ConvBuffer1x1* result);
 
+Status CreateConvBuffer1x1Wino4x4To6x6(const CreationContext& creation_context,
+                                       const OperationDef& definition,
+                                       const Convolution2DAttributes& attr,
+                                       ConvBuffer1x1* result);
+
 }  // namespace cl
 }  // namespace gpu
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_test.cc
deleted file mode 100644
index 2289600497e..00000000000
--- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_test.cc
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/ - -#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer.h" - -#include - -#include -#include -#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h" -#include "tensorflow/lite/delegates/gpu/common/operations.h" -#include "tensorflow/lite/delegates/gpu/common/status.h" - -using ::testing::FloatNear; -using ::testing::Pointwise; - -namespace tflite { -namespace gpu { -namespace cl { -namespace { - -TEST_F(OpenCLOperationTest, ConvBufferSimpleWeights) { - TensorFloat32 src_tensor; - src_tensor.shape = BHWC(1, 2, 2, 2); - src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}; - - Convolution2DAttributes attr; - attr.padding.prepended = HW(0, 0); - attr.padding.appended = HW(1, 1); - attr.strides = HW(1, 1); - attr.dilations = HW(1, 1); - attr.weights.shape = OHWI(1, 2, 2, 2); - attr.weights.data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; - attr.bias.shape = Linear(1); - attr.bias.data = {0.0f}; - - for (auto precision : env_.GetSupportedPrecisions()) { - const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f; - OperationDef op_def; - op_def.precision = precision; - auto data_type = DeduceDataTypeFromPrecision(precision); - op_def.src_tensors.push_back( - {data_type, TensorStorageType::BUFFER, Layout::HWC}); - op_def.dst_tensors.push_back( - {data_type, TensorStorageType::BUFFER, Layout::HWC}); - TensorFloat32 dst_tensor; - ConvBuffer operation; - ASSERT_OK(CreateConvBuffer(creation_context_, op_def, attr, &operation)); - ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, - BHWC(1, 2, 2, 1), &dst_tensor)); - EXPECT_THAT(dst_tensor.data, - Pointwise(FloatNear(eps), {28.0f, 18.0f, 22.0f, 13.0f})); - } -} - -TEST_F(OpenCLOperationTest, ConvBuffer) { - TensorFloat32 src_tensor; - src_tensor.shape = BHWC(1, 2, 2, 2); - src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}; - - Convolution2DAttributes attr; - attr.padding.prepended = HW(0, 0); - attr.padding.appended = HW(1, 1); - attr.strides = HW(1, 1); - attr.dilations = HW(1, 1); - attr.weights.shape = OHWI(2, 2, 2, 2); - attr.weights.data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, - 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}; - attr.bias.shape = Linear(2); - attr.bias.data = {0.5f, -0.5f}; - - for (auto precision : env_.GetSupportedPrecisions()) { - const float eps = precision == CalculationsPrecision::F32 ? 
1e-6f : 1e-3f; - OperationDef op_def; - op_def.precision = precision; - auto data_type = DeduceDataTypeFromPrecision(precision); - op_def.src_tensors.push_back( - {data_type, TensorStorageType::BUFFER, Layout::HWC}); - op_def.dst_tensors.push_back( - {data_type, TensorStorageType::BUFFER, Layout::HWC}); - TensorFloat32 dst_tensor; - ConvBuffer operation; - ASSERT_OK(CreateConvBuffer(creation_context_, op_def, attr, &operation)); - ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, - BHWC(1, 2, 2, 2), &dst_tensor)); - EXPECT_THAT(dst_tensor.data, - Pointwise(FloatNear(eps), {168.5f, 391.5f, 80.5f, 223.5f, 60.5f, - 235.5f, 20.5f, 123.5f})); - } -} - -} // namespace -} // namespace cl -} // namespace gpu -} // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc b/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc index f31a2ddfe71..78e9795bc63 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc @@ -69,6 +69,63 @@ std::string GenerateAsyncUpload(const std::string& local_ptr_name, offset + ", " + std::to_string(elements_to_upload) + ", 0);\n"; return c; } + +std::string GenerateBlockCoords(const int3& block_size, + const int3& work_group_launch_order, + bool linear_hw) { + std::string c; + int3 launch_remap; + launch_remap[work_group_launch_order.x] = 0; + launch_remap[work_group_launch_order.y] = 1; + launch_remap[work_group_launch_order.z] = 2; + if (linear_hw) { + if (work_group_launch_order[0] == 0) { + c += " int linear_hw = get_global_id(0);\n"; + } else { + c += " int linear_hw = get_group_id(" + std::to_string(launch_remap[0]) + + ") * get_local_size(0) + get_local_id(0);\n"; + } + c += " int Y = (linear_hw / task_size_x) * " + + std::to_string(block_size.y) + ";\n"; + c += " int X = (linear_hw % task_size_x) * " + + std::to_string(block_size.x) + ";\n"; + if (work_group_launch_order[1] == 1) { + c += " int Z = get_global_id(1) * " + std::to_string(block_size.z) + + ";\n"; + } else { + c += " int Z = (get_group_id(" + std::to_string(launch_remap[1]) + + ") * get_local_size(1) + get_local_id(1)) * " + + std::to_string(block_size.z) + ";\n"; + } + } else { + if (work_group_launch_order[0] == 0) { + c += " int X = get_global_id(0) * " + std::to_string(block_size.x) + + ";\n"; + } else { + c += " int X = (get_group_id(" + std::to_string(launch_remap[0]) + + ") * get_local_size(0) + get_local_id(0)) * " + + std::to_string(block_size.x) + ";\n"; + } + if (work_group_launch_order[1] == 1) { + c += " int Y = get_global_id(1) * " + std::to_string(block_size.y) + + ";\n"; + } else { + c += " int Y = (get_group_id(" + std::to_string(launch_remap[1]) + + ") * get_local_size(1) + get_local_id(1)) * " + + std::to_string(block_size.y) + ";\n"; + } + if (work_group_launch_order[2] == 2) { + c += " int Z = get_global_id(2) * " + std::to_string(block_size.z) + + ";\n"; + } else { + c += " int Z = (get_group_id(" + std::to_string(launch_remap[2]) + + ") * get_local_size(2) + get_local_id(2)) * " + + std::to_string(block_size.z) + ";\n"; + } + } + + return c; +} } // namespace ConvPowerVR::ConvPowerVR(const OperationDef& definition, @@ -89,6 +146,11 @@ ConvPowerVR::ConvPowerVR(const OperationDef& definition, kernel_dilation_(1, 1, 1, 1), conv_params_(GuessBestParams(device, definition, attr)) {} +ConvPowerVR::ConvPowerVR(const OperationDef& definition) + : GPUOperation(definition), + stride_padding_(1, 1, 0, 0), + kernel_dilation_(1, 1, 1, 1) {} + 
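The hunks below add a linear_hw mode that fuses the H and W grid dimensions into a single global id: BindArguments binds task_size_x, the number of X blocks (IntegralDivideRoundUp of the batched destination width by block_size.x), and the generated kernel recovers X and Y with a divide and a modulo. As a reading aid, here is a minimal host-side sketch of that decode; BlockOrigin and DecodeLinearHW are illustrative names and are not part of the patch.

// Mirrors the "linear_hw / task_size_x" and "linear_hw % task_size_x"
// arithmetic emitted by GenerateBlockCoords() above (illustrative only).
struct BlockOrigin { int x; int y; };

inline BlockOrigin DecodeLinearHW(int linear_hw, int task_size_x,
                                  int block_size_x, int block_size_y) {
  const int y = (linear_hw / task_size_x) * block_size_y;  // row of X blocks
  const int x = (linear_hw % task_size_x) * block_size_x;  // column in the row
  return {x, y};
}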
ConvPowerVR::ConvPowerVR(ConvPowerVR&& operation) : GPUOperation(std::move(operation)), weights_(std::move(operation.weights_)), @@ -141,6 +203,11 @@ Status ConvPowerVR::BindArguments() { int4(kernel_dilation_.x, kernel_dilation_.y, kernel_dilation_.z * src_[0]->Batch(), kernel_dilation_.w))); } + if (conv_params_.linear_hw) { + const int grid_x = IntegralDivideRoundUp( + dst_[0]->Width() * dst_[0]->Batch(), conv_params_.block_size.x); + RETURN_IF_ERROR(kernel_.SetBytesAuto(grid_x)); + } RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWBatchedHSB())); RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWBatchedHSB())); return OkStatus(); @@ -154,15 +221,27 @@ int3 ConvPowerVR::GetGridSize() const { const int grid_z = IntegralDivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.z); int3 wg; - wg.x = IntegralDivideRoundUp(grid_x, conv_params_.work_group_size.x); - wg.y = IntegralDivideRoundUp(grid_y, conv_params_.work_group_size.y); - wg.z = IntegralDivideRoundUp(grid_z, conv_params_.work_group_size.z); - return int3(wg[conv_params_.work_group_launch_order[0]] * - conv_params_.work_group_size.x, - wg[conv_params_.work_group_launch_order[1]] * - conv_params_.work_group_size.y, - wg[conv_params_.work_group_launch_order[2]] * - conv_params_.work_group_size.z); + + if (conv_params_.linear_hw) { + wg.x = + IntegralDivideRoundUp(grid_x * grid_y, conv_params_.work_group_size.x); + wg.y = IntegralDivideRoundUp(grid_z, conv_params_.work_group_size.y); + return int3(wg[conv_params_.work_group_launch_order[0]] * + conv_params_.work_group_size.x, + wg[conv_params_.work_group_launch_order[1]] * + conv_params_.work_group_size.y, + 1); + } else { + wg.x = IntegralDivideRoundUp(grid_x, conv_params_.work_group_size.x); + wg.y = IntegralDivideRoundUp(grid_y, conv_params_.work_group_size.y); + wg.z = IntegralDivideRoundUp(grid_z, conv_params_.work_group_size.z); + return int3(wg[conv_params_.work_group_launch_order[0]] * + conv_params_.work_group_size.x, + wg[conv_params_.work_group_launch_order[1]] * + conv_params_.work_group_size.y, + wg[conv_params_.work_group_launch_order[2]] * + conv_params_.work_group_size.z); + } } Status ConvPowerVR::Tune(const TuningParameters& params) { @@ -219,6 +298,12 @@ std::string GenerateConvPowerVR1x1( ? "__constant" : "__global"; + const std::string weights_data_type = + conv_params.weights_data_type == DataType::FLOAT32 ? 
"float4" : "half4"; + + const std::string weights_global_ptr = + weights_space + " " + weights_data_type + "*"; + const int3 work_group_size = conv_params.work_group_size; const int3 block_size = conv_params.block_size; if (conv_params.fixed_work_group_size) { @@ -229,41 +314,30 @@ std::string GenerateConvPowerVR1x1( } c += "__kernel void main_function(\n"; c += src_tensor.GetDeclaration(AccessType::READ) + ",\n"; - c += " " + weights_space + " ACCUM_FLT4* filters_buffer, \n"; - c += " " + weights_space + " ACCUM_FLT4* biases \n"; + c += " " + weights_global_ptr + " filters_buffer, \n"; + c += " " + weights_global_ptr + " biases \n"; c += GetArgsDeclaration(linked_operations); c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n"; if (!is1x1) { c += " int4 stride_padding, \n"; c += " int4 kernel_dilation, \n"; } + if (conv_params.linear_hw) { + c += " int task_size_x, \n"; + } c += " int4 src_size, \n"; c += " int4 dst_size \n"; c += ") {\n"; - int3 launch_remap; - launch_remap[conv_params.work_group_launch_order.x] = 0; - launch_remap[conv_params.work_group_launch_order.y] = 1; - launch_remap[conv_params.work_group_launch_order.z] = 2; - if (conv_params.work_group_launch_order[0] == 0) { - c += " int X = get_global_id(0) * " + std::to_string(block_size.x) + ";\n"; - } else { - c += " int X = (get_group_id(" + std::to_string(launch_remap[0]) + - ") * get_local_size(0) + get_local_id(0)) * " + - std::to_string(block_size.x) + ";\n"; + c += GenerateBlockCoords(conv_params.block_size, + conv_params.work_group_launch_order, + conv_params.linear_hw); + std::vector dst_x(conv_params.block_size.x); + for (int x = 0; x < conv_params.block_size.x; ++x) { + dst_x[x] = "(X + " + std::to_string(x) + ")"; } - if (conv_params.work_group_launch_order[1] == 1) { - c += " int Y = get_global_id(1) * " + std::to_string(block_size.y) + ";\n"; - } else { - c += " int Y = (get_group_id(" + std::to_string(launch_remap[1]) + - ") * get_local_size(1) + get_local_id(1)) * " + - std::to_string(block_size.y) + ";\n"; - } - if (conv_params.work_group_launch_order[2] == 2) { - c += " int Z = get_global_id(2) * " + std::to_string(block_size.z) + ";\n"; - } else { - c += " int Z = (get_group_id(" + std::to_string(launch_remap[2]) + - ") * get_local_size(2) + get_local_id(2)) * " + - std::to_string(block_size.z) + ";\n"; + std::vector dst_y(conv_params.block_size.y); + for (int y = 0; y < conv_params.block_size.y; ++y) { + dst_y[y] = "(Y + " + std::to_string(y) + ")"; } if (!need_local_mem) { c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) {\n"; @@ -272,8 +346,12 @@ std::string GenerateConvPowerVR1x1( } if (conv_params.weights_upload_type == ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS) { - c += " int lid = get_local_id(1) * " + std::to_string(work_group_size.x) + - " + get_local_id(0);\n"; + if (conv_params.linear_hw) { + c += " int lid = get_local_id(0);\n"; + } else { + c += " int lid = get_local_id(1) * " + + std::to_string(work_group_size.x) + " + get_local_id(0);\n"; + } } for (int z = 0; z < block_size.z; ++z) { for (int y = 0; y < block_size.y; ++y) { @@ -285,25 +363,23 @@ std::string GenerateConvPowerVR1x1( } if (!is1x1) { for (int x = 0; x < block_size.x; ++x) { - const std::string xc = "(X + " + std::to_string(x) + ")"; if (stride_correction) { c += " int xc" + std::to_string(x) + " = " + - GetXStrideCorrected(xc, "src_size.w", "stride_padding.x", + GetXStrideCorrected(dst_x[x], "src_size.w", "stride_padding.x", "stride_padding.z") + ";\n"; } else { - c += " int xc" + 
std::to_string(x) + " = " + xc + + c += " int xc" + std::to_string(x) + " = " + dst_x[x] + " * stride_padding.x + stride_padding.z;\n"; } } for (int y = 0; y < block_size.y; ++y) { - const std::string yc = "(Y + " + std::to_string(y) + ")"; - c += " int yc" + std::to_string(y) + " = " + yc + + c += " int yc" + std::to_string(y) + " = " + dst_y[y] + " * stride_padding.y + stride_padding.w;\n"; } } if (need_local_mem) { - c += " __local ACCUM_FLT4 weights_cache[" + + c += " __local " + weights_data_type + " weights_cache[" + std::to_string(block_size.z * 4 * conv_params.src_depth_loop_size) + "];\n"; } @@ -311,15 +387,23 @@ std::string GenerateConvPowerVR1x1( ConvPowerVR::WeightsUploadType::GLOBAL_MEM || conv_params.weights_upload_type == ConvPowerVR::WeightsUploadType::CONSTANT_MEM) { - c += " " + weights_space + " ACCUM_FLT4* weights_cache;\n"; + c += " " + weights_global_ptr + " weights_cache;\n"; } if (is1x1) { - c += " " + weights_space + - " ACCUM_FLT4* filters_loc = filters_buffer + Z * 4 * " - "src_size.z;\n"; + if (conv_params.different_weights_for_height) { + c += " " + weights_global_ptr + + " filters_loc = filters_buffer + (Z * src_size.y + Y * " + + std::to_string(block_size.z) + + ") * " + "4 * src_size.z;\n"; + } else { + c += " " + weights_global_ptr + + " filters_loc = filters_buffer + Z * 4 * " + "src_size.z;\n"; + } } else { - c += " " + weights_space + - " ACCUM_FLT4* filters_loc = filters_buffer + Z * 4 * " + c += " " + weights_global_ptr + + " filters_loc = filters_buffer + Z * 4 * " "src_size.z * kernel_dilation.x * kernel_dilation.y;\n"; } if (buffer_type) { @@ -354,10 +438,8 @@ std::string GenerateConvPowerVR1x1( const std::string yck = "yck" + std::to_string(y); for (int x = 0; x < block_size.x; ++x) { const std::string xck = "xck" + std::to_string(x); - std::string xc = - is1x1 ? "min(X + " + std::to_string(x) + ", src_size.x - 1)" : xck; - std::string yc = - is1x1 ? "min(Y + " + std::to_string(y) + ", src_size.y - 1)" : yck; + std::string xc = is1x1 ? "min(" + dst_x[x] + ", src_size.x - 1)" : xck; + std::string yc = is1x1 ? "min(" + dst_y[y] + ", src_size.y - 1)" : yck; std::string id = std::to_string(y) + std::to_string(x); c += " int src_a_" + id + " = " + yc + " * src_size.x + " + xc + ";\n"; } @@ -368,11 +450,7 @@ std::string GenerateConvPowerVR1x1( for (int y = 0; y < block_size.y; ++y) { for (int x = 0; x < block_size.x; ++x) { const std::string id = std::to_string(y) + std::to_string(x); - if (op_def.precision == CalculationsPrecision::F32_F16) { - c += " ACCUM_FLT4 src" + id + ";\n"; - } else { - c += " FLT4 src" + id + ";\n"; - } + c += " " + weights_data_type + " src" + id + ";\n"; } } }; @@ -386,52 +464,54 @@ std::string GenerateConvPowerVR1x1( ? 
"" : " * (FLT)(mx" + std::to_string(x) + " && my" + std::to_string(y) + ")"; - if (src_tensor_type == TensorStorageType::BUFFER) { - if (op_def.precision == CalculationsPrecision::F32_F16) { - c += " src" + id + " = convert_float4(src_data[src_a_" + id + - "]" + multiplier + ");\n"; - } else { - c += " src" + id + " = src_data[src_a_" + id + "]" + - multiplier + ";\n"; - } - } - if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) { - if (op_def.precision == CalculationsPrecision::F32_F16) { - c += " src" + id + " = " + - src_tensor.ReadAsFloat("src_a_" + id) + multiplier + ";\n"; - } else { - c += " src" + id + " = " + src_tensor.Read("src_a_" + id) + - multiplier + ";\n"; - } - } + c += " src" + id + " = " + + src_tensor.ReadAsType(conv_params.weights_data_type, + "src_a_" + id) + + multiplier + ";\n"; c += " src_a_" + id + " += src_layer_offset;\n"; } else { std::string id = std::to_string(y) + std::to_string(x); - const std::string xc = - is1x1 ? "X + " + std::to_string(x) : "xck" + std::to_string(x); - const std::string yc = - is1x1 ? "Y + " + std::to_string(y) : "yck" + std::to_string(y); - if (op_def.precision == CalculationsPrecision::F32_F16) { - c += " src" + id + " = " + - src_tensor.ReadAsFloatWHS(xc, yc, "s", mode) + ";\n"; - } else { - c += " src" + id + " = " + - src_tensor.ReadWHS(xc, yc, "s", mode) + ";\n"; - } + const std::string xc = is1x1 ? dst_x[x] : "xck" + std::to_string(x); + const std::string yc = is1x1 ? dst_y[y] : "yck" + std::to_string(y); + c += " src" + id + " = " + + src_tensor.ReadAsTypeWHS(conv_params.weights_data_type, xc, yc, + "s", mode) + + ";\n"; } } } }; + const bool weights_type_as_accum_type = + !(op_def.precision == CalculationsPrecision::F32_F16 && + conv_params.weights_data_type == DataType::FLOAT16); auto conv_core = [&](int shared_offset) { const std::string channels[] = {"x", "y", "z", "w"}; for (int z = 0; z < block_size.z; ++z) { - for (int ch = 0; ch < 4; ++ch) { + if (weights_type_as_accum_type) { + for (int ch = 0; ch < 4; ++ch) { + for (int y = 0; y < block_size.y; ++y) { + for (int x = 0; x < block_size.x; ++x) { + std::string id = std::to_string(y) + std::to_string(x); + c += " r" + std::to_string(z) + id + " += weights_cache[" + + std::to_string(z * 4 + ch + shared_offset) + "] * src" + id + + "." + channels[ch] + ";\n"; + } + } + } + } else { // F32_F16 precision and weights type is float16 for (int y = 0; y < block_size.y; ++y) { for (int x = 0; x < block_size.x; ++x) { std::string id = std::to_string(y) + std::to_string(x); - c += " r" + std::to_string(z) + id + " += weights_cache[" + - std::to_string(z * 4 + ch + shared_offset) + "] * src" + id + - "." 
+ channels[ch] + ";\n"; + std::string R = "r" + std::to_string(z) + id; + std::string S = "src" + id; + const int dz = z * 4 + shared_offset; + std::string f0 = "weights_cache[" + std::to_string(dz + 0) + "]"; + std::string f1 = "weights_cache[" + std::to_string(dz + 1) + "]"; + std::string f2 = "weights_cache[" + std::to_string(dz + 2) + "]"; + std::string f3 = "weights_cache[" + std::to_string(dz + 3) + "]"; + c += " " + R + " += convert_float4(" + S + ".x * " + f0 + " + " + + S + ".y * " + f1 + " + " + S + ".z * " + f2 + " + " + S + + ".w * " + f3 + ");\n"; } } } @@ -497,14 +577,16 @@ std::string GenerateConvPowerVR1x1( c += " }\n"; } for (int z = 0; z < block_size.z; ++z) { - c += " if (Z + " + std::to_string(z) + " >= dst_size.z) return;\n"; + const std::string sz = std::to_string(z); + c += " if (Z + " + sz + " >= dst_size.z) return;\n"; + c += " {\n"; + c += " FLT4 bias_val = TO_FLT4(weights_cache[" + sz + "]);\n"; for (int y = 0; y < block_size.y; ++y) { for (int x = 0; x < block_size.x; ++x) { - const std::string xs = "X + " + std::to_string(x); - const std::string ys = "Y + " + std::to_string(y); - const std::string zs = "Z + " + std::to_string(z); - const std::string r_id = - std::to_string(z) + std::to_string(y) + std::to_string(x); + const std::string xs = dst_x[x]; + const std::string ys = dst_y[y]; + const std::string zs = "Z + " + sz; + const std::string r_id = sz + std::to_string(y) + std::to_string(x); bool need_x_check = x != 0; bool need_y_check = y != 0; if (need_x_check && need_y_check) { @@ -516,14 +598,14 @@ std::string GenerateConvPowerVR1x1( } else { c += " {\n"; } - c += " FLT4 res = TO_FLT4(r" + r_id + " + weights_cache[" + - std::to_string(z) + "]);\n"; + c += " FLT4 res = TO_FLT4(r" + r_id + ") + bias_val;\n"; const LinkingContext context{"res", xs, ys, zs}; c += PostProcess(linked_operations, context); c += " " + dst_tensor.WriteWHS("res", xs, ys, zs) + "\n"; c += " }\n"; } } + c += " }\n"; } c += "}\n"; return c; @@ -531,15 +613,27 @@ std::string GenerateConvPowerVR1x1( ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams( const CLDevice& device, const OperationDef& definition, int src_depth, - int dst_depth, bool x_kernel_is_1, bool y_kernel_is_1) const { + int dst_depth, bool x_kernel_is_1, bool y_kernel_is_1, + bool different_weights_for_height) const { ConvParams conv_params; + conv_params.linear_hw = false; + conv_params.weights_data_type = + DeduceDataTypeFromPrecision(definition.precision); conv_params.x_kernel_is_1 = x_kernel_is_1; conv_params.y_kernel_is_1 = y_kernel_is_1; + conv_params.different_weights_for_height = different_weights_for_height; if (device.IsNvidia()) { + if (different_weights_for_height) { + conv_params.work_group_size = int3(32, 1, 1); + conv_params.work_group_launch_order = int3(2, 0, 1); + conv_params.fixed_work_group_size = true; + } else { + conv_params.linear_hw = true; + conv_params.work_group_size = int3(32, 1, 1); + conv_params.work_group_launch_order = int3(1, 0, 2); + conv_params.fixed_work_group_size = true; + } conv_params.block_size = int3(1, 1, 4); - conv_params.work_group_size = int3(8, 4, 1); - conv_params.work_group_launch_order = int3(2, 0, 1); - conv_params.fixed_work_group_size = true; conv_params.src_depth_loop_size = 1; conv_params.weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS; if (dst_depth % 4 == 0 || dst_depth >= 8) { @@ -556,10 +650,20 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams( conv_params.src_depth_loop_size = 4; } } else if (device.IsPowerVR()) { + if 
(different_weights_for_height) { + conv_params.work_group_size = int3(32, 1, 1); + conv_params.work_group_launch_order = int3(2, 0, 1); + conv_params.fixed_work_group_size = true; + } else { + conv_params.linear_hw = true; + conv_params.work_group_size = int3(32, 1, 1); + conv_params.work_group_launch_order = int3(1, 0, 2); + conv_params.fixed_work_group_size = true; + } + conv_params.weights_data_type = + definition.precision == CalculationsPrecision::F16 ? DataType::FLOAT16 + : DataType::FLOAT32; conv_params.block_size = int3(1, 1, 4); - conv_params.work_group_size = int3(8, 4, 1); - conv_params.work_group_launch_order = int3(2, 0, 1); - conv_params.fixed_work_group_size = true; conv_params.src_depth_loop_size = 1; conv_params.weights_upload_type = WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP; @@ -592,16 +696,22 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams( } } conv_params.block_size.x = 2; - conv_params.work_group_size = int3(4, 8, 1); } } else if (device.IsAMD()) { + if (different_weights_for_height) { + conv_params.work_group_size = int3(32, 1, 1); + conv_params.work_group_launch_order = int3(2, 0, 1); + conv_params.fixed_work_group_size = true; + } else { + conv_params.work_group_size = int3(8, 4, 1); + conv_params.work_group_launch_order = int3(2, 0, 1); + conv_params.fixed_work_group_size = true; + } + conv_params.block_size = int3(2, 1, 1); if (x_kernel_is_1 && y_kernel_is_1) { conv_params.block_size.y = 2; } - conv_params.work_group_size = int3(8, 4, 1); - conv_params.work_group_launch_order = int3(2, 0, 1); - conv_params.fixed_work_group_size = true; conv_params.src_depth_loop_size = 1; conv_params.weights_upload_type = WeightsUploadType::CONSTANT_MEM; if (dst_depth % 8 == 0 || dst_depth >= 32) { @@ -616,9 +726,21 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams( if (src_depth % 2 == 0 && src_depth >= 16) { conv_params.src_depth_loop_size = 2; } + } else if (device.IsMali()) { + conv_params.block_size = int3(2, 1, 1); + conv_params.work_group_size = int3(4, 4, 1); + conv_params.work_group_launch_order = int3(0, 1, 2); + conv_params.fixed_work_group_size = false; + conv_params.src_depth_loop_size = 1; + conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM; + if (dst_depth % 2 == 0 || dst_depth >= 4) { + conv_params.block_size.z = 2; + } else { + conv_params.block_size.z = 1; + } } else { conv_params.block_size = int3(1, 1, 4); - conv_params.work_group_size = int3(8, 4, 1); + conv_params.work_group_size = int3(8, 2, 1); conv_params.work_group_launch_order = int3(0, 1, 2); conv_params.fixed_work_group_size = false; conv_params.src_depth_loop_size = 1; @@ -655,7 +777,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams( attr.padding.prepended.h == 0 && attr.padding.appended.h == 0; return GuessBestParams(device, definition, src_depth, dst_depth, - x_kernel_is_1, y_kernel_is_1); + x_kernel_is_1, y_kernel_is_1, false); } ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams( @@ -663,9 +785,24 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams( const FullyConnectedAttributes& attr) const { const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4); const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4); - ConvPowerVR::ConvParams params = - GuessBestParams(device, definition, src_depth, dst_depth, true, true); - params.work_group_size = int3(32, 1, 1); + ConvPowerVR::ConvParams params = GuessBestParams( + device, definition, src_depth, dst_depth, true, true, false); + params.work_group_size.x *= params.work_group_size.y; + 
params.work_group_size.y = 1;
+  params.block_size.x *= params.block_size.y;
+  params.block_size.y = 1;
+  return params;
+}
+
+ConvPowerVR::ConvParams ConvPowerVR::GuessBestParamsWinograd(
+    const CLDevice& device, const OperationDef& definition,
+    const Convolution2DAttributes& attr) const {
+  const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
+  const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
+  ConvPowerVR::ConvParams params = GuessBestParams(
+      device, definition, src_depth, dst_depth, true, true, true);
+  params.block_size.x *= params.block_size.y;
+  params.block_size.y = 1;
   return params;
 }
 
@@ -685,6 +822,17 @@ Status CreateConvPowerVR(const CreationContext& creation_context,
   return result->UploadData(attr.weights, attr.bias, creation_context.context);
 }
 
+Status CreateConvPowerVRWino4x4To6x6(const CreationContext& creation_context,
+                                     const OperationDef& definition,
+                                     const Convolution2DAttributes& attr,
+                                     ConvPowerVR* result) {
+  *result = ConvPowerVR(definition);
+  result->conv_params_ = result->GuessBestParamsWinograd(
+      *creation_context.device, definition, attr);
+  return result->UploadDataForWinograd4x4To6x6(
+      attr.weights, *creation_context.device, creation_context.context);
+}
+
 }  // namespace cl
 }  // namespace gpu
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h b/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h
index ed66a3b0dad..110b983940a 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h
@@ -58,10 +58,20 @@ class ConvPowerVR : public GPUOperation {
   };
 
   struct ConvParams {
+    // We usually use these combinations of CalculationsPrecision:
+    //   F32: everything in F32
+    //   F16: everything in F16
+    //   F32_F16: everything except the accumulator is F16, including weights
+    // But on PowerVR we can achieve better performance in F32_F16 with F32
+    // weights, so for PowerVR this kernel uses F32 weights in the
+    // F32_F16 precision mode.
+    DataType weights_data_type;  // used for weights and biases
     int3 block_size;
     int3 work_group_size;
     int3 work_group_launch_order;
     bool fixed_work_group_size;
+    bool linear_hw;
+    bool different_weights_for_height;
     int src_depth_loop_size;
     WeightsUploadType weights_upload_type;
     bool x_kernel_is_1;
@@ -72,12 +82,18 @@ class ConvPowerVR : public GPUOperation {
               const Convolution2DAttributes& attr, const CLDevice& device);
   ConvPowerVR(const OperationDef& definition,
               const FullyConnectedAttributes& attr, const CLDevice& device);
+  explicit ConvPowerVR(const OperationDef& definition);
 
   template <DataType T>
   Status UploadData(const ::tflite::gpu::Tensor<OHWI, T>& weights,
                     const ::tflite::gpu::Tensor<Linear, T>& biases,
                     CLContext* context);
   template <DataType T>
+  Status UploadDataForWinograd4x4To6x6(
+      const ::tflite::gpu::Tensor<OHWI, T>& weights, const CLDevice& device,
+      CLContext* context);
+
+  template <DataType T>
   Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
                        CLContext* context);
 
@@ -91,6 +107,10 @@ class ConvPowerVR : public GPUOperation {
                                   const FullyConnectedAttributes& attr,
                                   ConvPowerVR* result);
 
+  friend Status CreateConvPowerVRWino4x4To6x6(
+      const CreationContext& creation_context, const OperationDef& definition,
+      const Convolution2DAttributes& attr, ConvPowerVR* result);
+
   friend std::string GenerateConvPowerVR1x1(
       const OperationDef& op_def, bool stride_correction,
       const ConvParams& conv_params,
@@ -102,10 +122,14 @@
   ConvParams GuessBestParams(const CLDevice& device,
                              const OperationDef& definition,
                              const
FullyConnectedAttributes& attr) const; + ConvParams GuessBestParamsWinograd(const CLDevice& device, + const OperationDef& definition, + const Convolution2DAttributes& attr) const; ConvParams GuessBestParams(const CLDevice& device, const OperationDef& definition, int src_depth, int dst_depth, bool x_kernel_is_1, - bool y_kernel_is_1) const; + bool y_kernel_is_1, + bool different_weights_for_height) const; Status BindArguments(); int3 GetGridSize() const; @@ -127,21 +151,37 @@ Status ConvPowerVR::UploadData(const ::tflite::gpu::Tensor& weights, RETURN_IF_ERROR(UploadWeights(weights, context)); LinearStorageCreateInfo create_info; create_info.storage_type = LinearStorageType::BUFFER; - create_info.data_type = definition_.precision == CalculationsPrecision::F16 - ? DataType::FLOAT16 - : DataType::FLOAT32; + create_info.data_type = conv_params_.weights_data_type; create_info.aligned_size = weights.shape.o; RETURN_IF_ERROR(CreateLinearStorage(create_info, biases, context, &biases_)); return OkStatus(); } +template +Status ConvPowerVR::UploadDataForWinograd4x4To6x6( + const ::tflite::gpu::Tensor& weights, const CLDevice& device, + CLContext* context) { + ::tflite::gpu::Tensor wino_weights; + RearrangeWeightsToWinograd4x4To6x6Weights(weights, &wino_weights); + RETURN_IF_ERROR(UploadWeights(wino_weights, context)); + LinearStorageCreateInfo create_info; + create_info.storage_type = LinearStorageType::BUFFER; + create_info.data_type = conv_params_.weights_data_type; + create_info.aligned_size = weights.shape.o; + ::tflite::gpu::Tensor bias; + bias.shape = Linear(weights.shape.o); + bias.data.resize(weights.shape.o, 0.0f); + RETURN_IF_ERROR(CreateLinearStorage(create_info, bias, context, &biases_)); + return OkStatus(); +} + template Status ConvPowerVR::UploadWeights(const ::tflite::gpu::Tensor& weights, CLContext* context) { const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4); const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4); - const bool f32_weights = definition_.precision != CalculationsPrecision::F16; + const bool f32_weights = conv_params_.weights_data_type == DataType::FLOAT32; const int float4_size = f32_weights ? sizeof(float4) : sizeof(half4); const int dst_depth_aligned = AlignByN(dst_depth, conv_params_.block_size.z); @@ -173,6 +213,11 @@ Status CreateConvPowerVR(const CreationContext& creation_context, const FullyConnectedAttributes& attr, ConvPowerVR* result); +Status CreateConvPowerVRWino4x4To6x6(const CreationContext& creation_context, + const OperationDef& definition, + const Convolution2DAttributes& attr, + ConvPowerVR* result); + } // namespace cl } // namespace gpu } // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.cc b/tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.cc index 4eefb3de52a..780d6646ea8 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.cc @@ -32,7 +32,8 @@ namespace cl { namespace { std::string GenerateConvCode( const OperationDef& op_def, const int3& block_size, bool is1x1, - bool adreno4xx_optimization, bool stride_correction, const CLDevice& device, + bool adreno4xx_optimization, bool stride_correction, + bool different_weights_for_height, const CLDevice& device, const std::vector& linked_operations) { std::string c = GetCommonDefines(op_def.precision); TensorCodeGenerator src_tensor( @@ -128,6 +129,9 @@ std::string GenerateConvCode( " = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"; } std::string f_y = is1x1 ? 
"s" : "filter_offset"; + if (different_weights_for_height) { + f_y = "Y * src_size.z + s"; + } if (!is1x1) { for (int x = 0; x < block_size.x; ++x) { c += " int cx" + xs[x] + ";\n"; @@ -329,16 +333,17 @@ ConvTexture::ConvTexture(const OperationDef& definition, stride_(attr.strides.w, attr.strides.h), padding_(-attr.padding.prepended.w, -attr.padding.prepended.h), dilation_(attr.dilations.w, attr.dilations.h), + different_weights_for_height_(false), block_size_(2, 2, 2), work_group_size_(4, 4, 2) {} -ConvTexture::ConvTexture(const OperationDef& definition, - const FullyConnectedAttributes& attr) +ConvTexture::ConvTexture(const OperationDef& definition) : GPUOperation(definition), kernel_size_(1, 1), stride_(1, 1), padding_(0, 0), dilation_(1, 1), + different_weights_for_height_(false), block_size_(4, 1, 2), work_group_size_(16, 1, 2) {} @@ -353,6 +358,7 @@ ConvTexture::ConvTexture(ConvTexture&& operation) stride_(operation.stride_), padding_(operation.padding_), dilation_(operation.dilation_), + different_weights_for_height_(operation.different_weights_for_height_), block_size_(operation.block_size_), kernel_(std::move(operation.kernel_)), work_group_size_(operation.work_group_size_) {} @@ -368,6 +374,8 @@ ConvTexture& ConvTexture::operator=(ConvTexture&& operation) { std::swap(stride_, operation.stride_); std::swap(padding_, operation.padding_); std::swap(dilation_, operation.dilation_); + std::swap(different_weights_for_height_, + operation.different_weights_for_height_); std::swap(block_size_, operation.block_size_); kernel_ = std::move(operation.kernel_); std::swap(work_group_size_, operation.work_group_size_); @@ -386,9 +394,10 @@ Status ConvTexture::Compile(const CreationContext& creation_context) { definition_.precision == CalculationsPrecision::F16; const bool stride_correction = definition_.IsBatchSupported() && stride_.x != 1; - const std::string code = GenerateConvCode( - definition_, block_size_, is1x1, adreno4xx_optimization, - stride_correction, *creation_context.device, linked_operations_); + const std::string code = + GenerateConvCode(definition_, block_size_, is1x1, adreno4xx_optimization, + stride_correction, different_weights_for_height_, + *creation_context.device, linked_operations_); std::vector options; if (UseFP16SIMD(*creation_context.device, definition_.precision, is1x1)) { options.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE); @@ -452,10 +461,21 @@ Status CreateConvTexture(const CreationContext& creation_context, const OperationDef& definition, const FullyConnectedAttributes& attr, ConvTexture* result) { - *result = ConvTexture(definition, attr); + *result = ConvTexture(definition); return result->UploadData(attr.weights, attr.bias, creation_context.context); } +Status CreateConvTextureWino4x4To6x6(const CreationContext& creation_context, + const OperationDef& definition, + const Convolution2DAttributes& attr, + ConvTexture* result) { + *result = ConvTexture(definition); + result->different_weights_for_height_ = true; + result->block_size_ = {4, 1, 2}; + return result->UploadDataForWinograd4x4To6x6( + attr.weights, *creation_context.device, creation_context.context); +} + } // namespace cl } // namespace gpu } // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h b/tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h index 4332659b2c2..fb25f655057 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h @@ -21,6 +21,7 @@ limitations under the 
License.
 
 #include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
 #include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
 #include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
+#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
 #include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
 #include "tensorflow/lite/delegates/gpu/cl/tensor.h"
 #include "tensorflow/lite/delegates/gpu/cl/texture2d.h"
@@ -60,15 +61,24 @@ class ConvTexture : public GPUOperation {
                                   const OperationDef& definition,
                                   const FullyConnectedAttributes& attr,
                                   ConvTexture* result);
+
+  friend Status CreateConvTextureWino4x4To6x6(
+      const CreationContext& creation_context, const OperationDef& definition,
+      const Convolution2DAttributes& attr, ConvTexture* result);
+
   ConvTexture(const OperationDef& definition,
               const Convolution2DAttributes& attr);
-  ConvTexture(const OperationDef& definition,
-              const FullyConnectedAttributes& attr);
+  explicit ConvTexture(const OperationDef& definition);
   template <DataType T>
   Status UploadData(const ::tflite::gpu::Tensor<OHWI, T>& weights,
                     const ::tflite::gpu::Tensor<Linear, T>& biases,
                     CLContext* context);
+
+  template <DataType T>
+  Status UploadDataForWinograd4x4To6x6(
+      const ::tflite::gpu::Tensor<OHWI, T>& weights, const CLDevice& device,
+      CLContext* context);
+
   template <DataType T>
   Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
                        CLContext* context);
@@ -92,6 +102,11 @@ class ConvTexture : public GPUOperation {
   int2 padding_;
   int2 dilation_;
 
+  // By default, 2D convolution uses the same weights for the W and H
+  // dimensions, but in some cases (e.g. the Winograd transform) we need
+  // separate weights per H position; the convolution kernel requires only
+  // very small modifications to support this.
+  bool different_weights_for_height_;
+
   int3 block_size_ = int3(2, 2, 2);
 
   CLKernel kernel_;
@@ -111,15 +126,35 @@ Status ConvTexture::UploadData(const ::tflite::gpu::Tensor<OHWI, T>& weights,
   return OkStatus();
 }
 
+template <DataType T>
+Status ConvTexture::UploadDataForWinograd4x4To6x6(
+    const ::tflite::gpu::Tensor<OHWI, T>& weights, const CLDevice& device,
+    CLContext* context) {
+  ::tflite::gpu::Tensor<OHWI, T> wino_weights;
+  RearrangeWeightsToWinograd4x4To6x6Weights(weights, &wino_weights);
+  RETURN_IF_ERROR(UploadWeights(wino_weights, context));
+
+  LinearStorageCreateInfo create_info;
+  create_info.storage_type = LinearStorageType::TEXTURE_2D;
+  create_info.data_type = definition_.GetDataType();
+  create_info.aligned_size = 1;
+  ::tflite::gpu::Tensor<Linear, DataType::FLOAT32> bias;
+  bias.shape = Linear(1);
+  bias.data = {0.0f};
+  return CreateLinearStorage(create_info, bias, context, &biases_);
+}
+
 template <DataType T>
 Status ConvTexture::UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
                                   CLContext* context) {
-  const int dst_depth =
-      AlignByN(IntegralDivideRoundUp(weights.shape.o, 4), block_size_.z);
+  int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4);
+  dst_depth = AlignByN(dst_depth, block_size_.z);
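// Worked example for the alignment above (illustrative, not from the original
// patch): with weights.shape.o == 10 and block_size_.z == 2,
// IntegralDivideRoundUp(10, 4) == 3 and AlignByN(3, 2) == 4, so the texture
// width is padded out to a whole number of z-blocks.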
const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4); + const int kernel_x = weights.shape.w; + const int kernel_y = weights.shape.h; int texture_width = dst_depth; for (int d = 0; d < dst_depth / block_size_.z; ++d) { - for (int y = 0; y < kernel_size_.y; ++y) { - for (int x = 0; x < kernel_size_.x; ++x) { + for (int y = 0; y < kernel_y; ++y) { + for (int x = 0; x < kernel_x; ++x) { for (int s = 0; s < src_depth; ++s) { for (int sub_d = 0; sub_d < block_size_.z; ++sub_d) { T filters[4]; @@ -196,7 +233,7 @@ void ConvTexture::RearrangeWeightsData( } } int x_coord = d * block_size_.z + sub_d; - int y_coord = (y * kernel_size_.x + x) * src_depth + s; + int y_coord = (y * kernel_x + x) * src_depth + s; int offset = y_coord * texture_width + x_coord; dst_0[offset] = filters[0]; dst_1[offset] = filters[1]; @@ -219,6 +256,11 @@ Status CreateConvTexture(const CreationContext& creation_context, const FullyConnectedAttributes& attr, ConvTexture* result); +Status CreateConvTextureWino4x4To6x6(const CreationContext& creation_context, + const OperationDef& definition, + const Convolution2DAttributes& attr, + ConvTexture* result); + } // namespace cl } // namespace gpu } // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc index 9fb3e45fe81..7c394a45669 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc @@ -48,6 +48,9 @@ std::string ElementwiseOneInput::GetCoreCode( case OperationType::COS: result = "$0 = cos($0);\n"; break; + case OperationType::EXP: + result = "$0 = exp($0);\n"; + break; case OperationType::HARD_SWISH: result = "$0 *= clamp($0 * (FLT)(0.16666667f) + (FLT)(0.5f), (FLT4)(0.0f), " @@ -213,14 +216,16 @@ Status ElementwiseTwoInput::BindArguments(CLKernel* kernel) { ElementwiseTwoInput CreateElementwiseTwoInput( const CreationContext& creation_context, const OperationDef& definition, const OperationType& op_type, const BroadcastSettings& broadcast, - const ElementwiseAttributes& attr) { + const ElementwiseAttributes* attr) { ElementwiseTwoInput operation(definition, op_type, broadcast); - auto scalar = absl::get_if(&attr.param); - if (scalar) { - const auto scalar_precision = creation_context.device->IsPowerVR() - ? CalculationsPrecision::F32 - : definition.precision; - operation.SetScalarPara(FLT(scalar_precision, *scalar)); + if (attr) { + const float* scalar = absl::get_if(&attr->param); + if (scalar) { + const auto scalar_precision = creation_context.device->IsPowerVR() + ? 
CalculationsPrecision::F32 + : definition.precision; + operation.SetScalarPara(FLT(scalar_precision, *scalar)); + } } operation.SetLinkIndex(0); return operation; diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.h b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.h index a70114d1081..8bf33b0c128 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.h @@ -92,7 +92,7 @@ class ElementwiseTwoInput : public ElementwiseOperation { ElementwiseTwoInput CreateElementwiseTwoInput( const CreationContext& creation_context, const OperationDef& definition, const OperationType& op_type, const BroadcastSettings& broadcast, - const ElementwiseAttributes& attr); + const ElementwiseAttributes* attr); ElementwiseTwoInput CreateElementwiseTwoInput( const OperationDef& definition, const OperationType& op_type, diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise_test.cc index aa1f83cc495..d558f2a6bd4 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise_test.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise_test.cc @@ -81,6 +81,33 @@ TEST_F(OpenCLOperationTest, Cos) { } } +TEST_F(OpenCLOperationTest, Exp) { + TensorFloat32 src_tensor; + src_tensor.shape = BHWC(1, 1, 1, 7); + src_tensor.data = {0.0f, 1.0f, -1.0f, 100.0f, -100.0f, 0.01f, -0.01f}; + + for (auto storage : env_.GetSupportedStorages()) { + for (auto precision : env_.GetSupportedPrecisions()) { + const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f; + OperationDef op_def; + op_def.precision = precision; + auto data_type = DeduceDataTypeFromPrecision(precision); + op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); + op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); + TensorFloat32 dst_tensor; + ElementwiseOneInput operation = + CreateElementwiseOneInput(op_def, OperationType::EXP); + ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, + BHWC(1, 1, 1, 7), &dst_tensor)); + EXPECT_THAT(dst_tensor.data, + Pointwise(FloatNear(eps), + {std::exp(0.0f), std::exp(1.0f), std::exp(-1.0f), + std::exp(100.0f), std::exp(-100.0f), + std::exp(0.01f), std::exp(-0.01f)})); + } + } +} + TEST_F(OpenCLOperationTest, HardSwish) { TensorFloat32 src_tensor; src_tensor.shape = BHWC(1, 1, 1, 7); diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/mean.cc b/tensorflow/lite/delegates/gpu/cl/kernels/mean.cc index a22037d46b6..9dd0546c059 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/mean.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/mean.cc @@ -104,6 +104,9 @@ Mean& Mean::operator=(Mean&& operation) { } Status Mean::Compile(const CreationContext& creation_context) { + if (creation_context.device->IsAdreno3xx()) { + work_group_size_ = int3(16, 8, 1); + } const auto code = GetMeanKernelCode(definition_, linked_operations_, work_group_size_); return creation_context.cache->GetOrCreateCLKernel( diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/space_to_depth.cc b/tensorflow/lite/delegates/gpu/cl/kernels/space_to_depth.cc new file mode 100644 index 00000000000..db6882ce4f4 --- /dev/null +++ b/tensorflow/lite/delegates/gpu/cl/kernels/space_to_depth.cc @@ -0,0 +1,141 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/delegates/gpu/cl/kernels/space_to_depth.h" + +#include +#include +#include + +#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h" +#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h" + +namespace tflite { +namespace gpu { +namespace cl { +namespace { + +std::string GetSpaceToDepthCode( + const OperationDef& op_def, + const std::vector& linked_operations) { + TensorCodeGenerator src_tensor( + "src_data", WHSPoint{"src_size.x", "src_size.y", "src_size.z"}, + op_def.src_tensors[0]); + TensorCodeGenerator dst_tensor( + "dst_data", WHSPoint{"dst_size.x", "dst_size.y", "dst_size.z"}, + op_def.dst_tensors[0]); + std::string c = GetCommonDefines(op_def.precision); + c += "__kernel void main_function(\n"; + c += src_tensor.GetDeclaration(AccessType::READ); + c += GetArgsDeclaration(linked_operations); + c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n"; + c += " int4 src_size,\n"; + c += " int4 dst_size,\n"; + c += " int src_channels,\n"; + c += " int block_size) {\n"; + c += " int X = get_global_id(0);\n"; + c += " int Y = get_global_id(1);\n"; + c += " int Z = get_global_id(2);\n"; + c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;\n"; + c += " FLT tmp[4];\n"; + c += " tmp[0] = (FLT)(0.0f);\n"; + c += " tmp[1] = (FLT)(0.0f);\n"; + c += " tmp[2] = (FLT)(0.0f);\n"; + c += " tmp[3] = (FLT)(0.0f);\n"; + c += " for (int i = 0; i < 4; ++i) {\n"; + c += " int dst_c = 4 * Z + i;\n"; + c += " int block_id = dst_c / src_channels;\n"; + c += " int src_x = X * block_size + block_id % block_size;\n"; + c += " int src_y = Y * block_size + block_id / block_size;\n"; + c += " int src_c = dst_c % src_channels;\n"; + c += " int src_z = src_c / 4;\n"; + c += " FLT4 t = " + src_tensor.ReadWHS("src_x", "src_y", "src_z") + ";\n"; + c += " FLT t_ar[4] = {t.x, t.y, t.z, t.w};\n"; + c += " tmp[i] = t_ar[src_c % 4];\n"; + c += " }\n"; + c += " FLT4 result = (FLT4)(tmp[0], tmp[1], tmp[2], tmp[3]);\n"; + const LinkingContext context = { + .var_name = "result", + .x_coord = "X", + .y_coord = "Y", + .s_coord = "Z", + }; + c += PostProcess(linked_operations, context); + c += " " + dst_tensor.WriteWHS("result", "X", "Y", "Z"); + c += "}\n"; + return c; +} + +} // namespace + +SpaceToDepth::SpaceToDepth(SpaceToDepth&& operation) + : GPUOperation(std::move(operation)), + attr_(operation.attr_), + kernel_(std::move(operation.kernel_)), + work_group_size_(operation.work_group_size_) {} + +SpaceToDepth& SpaceToDepth::operator=(SpaceToDepth&& operation) { + if (this != &operation) { + attr_ = operation.attr_; + kernel_ = std::move(operation.kernel_); + std::swap(work_group_size_, operation.work_group_size_); + GPUOperation::operator=(std::move(operation)); + } + return *this; +} + +Status SpaceToDepth::Compile(const CreationContext& creation_context) { + const auto code = GetSpaceToDepthCode(definition_, linked_operations_); + return creation_context.cache->GetOrCreateCLKernel( + code, "main_function", *creation_context.context, + *creation_context.device, &kernel_); +} + 
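For readers of the kernel generator above, the per-channel gather can be summarized host-side. A minimal reference sketch follows; SrcLocation and SpaceToDepthSource are assumed, illustrative names and are not part of the patch.

// Mirrors the block_id / src_x / src_y / src_c arithmetic emitted by
// GetSpaceToDepthCode() for one output element (X, Y, dst_c).
struct SrcLocation {
  int x;      // source column
  int y;      // source row
  int slice;  // source z-slice (group of 4 channels)
  int lane;   // component within the FLT4 read at that slice
};

inline SrcLocation SpaceToDepthSource(int X, int Y, int dst_c, int block_size,
                                      int src_channels) {
  const int block_id = dst_c / src_channels;
  const int src_x = X * block_size + block_id % block_size;
  const int src_y = Y * block_size + block_id / block_size;
  const int src_c = dst_c % src_channels;
  return {src_x, src_y, src_c / 4, src_c % 4};
}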
+Status SpaceToDepth::BindArguments() { + kernel_.ResetBindingCounter(); + RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr())); + RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_)); + RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting())); + RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHSB())); + RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHSB())); + RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->Channels())); + return kernel_.SetBytesAuto(attr_.block_size); +} + +int3 SpaceToDepth::GetGridSize() const { + const int grid_x = dst_[0]->Width() * dst_[0]->Batch(); + const int grid_y = dst_[0]->Height(); + const int grid_z = dst_[0]->Slices(); + return int3(grid_x, grid_y, grid_z); +} + +Status SpaceToDepth::Tune(const TuningParameters& params) { + RETURN_IF_ERROR(BindArguments()); + return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_); +} + +Status SpaceToDepth::AddToQueue(CLCommandQueue* queue) { + RETURN_IF_ERROR(BindArguments()); + return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_); +} + +SpaceToDepth CreateSpaceToDepth(const OperationDef& op_def, + const SpaceToDepthAttributes& attr) { + return SpaceToDepth(op_def, attr); +} + +} // namespace cl +} // namespace gpu +} // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/space_to_depth.h b/tensorflow/lite/delegates/gpu/cl/kernels/space_to_depth.h new file mode 100644 index 00000000000..3d316569fcb --- /dev/null +++ b/tensorflow/lite/delegates/gpu/cl/kernels/space_to_depth.h @@ -0,0 +1,58 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_SPACE_TO_DEPTH_H_ +#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_SPACE_TO_DEPTH_H_ + +#include "tensorflow/lite/delegates/gpu/cl/cl_kernel.h" +#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h" +#include "tensorflow/lite/delegates/gpu/common/operations.h" +#include "tensorflow/lite/delegates/gpu/common/status.h" +#include "tensorflow/lite/delegates/gpu/common/types.h" + +namespace tflite { +namespace gpu { +namespace cl { + +class SpaceToDepth : public GPUOperation { + public: + SpaceToDepth(const OperationDef& op_def, const SpaceToDepthAttributes& attr) + : GPUOperation(op_def), attr_(attr), work_group_size_(8, 4, 1) {} + Status AddToQueue(CLCommandQueue* queue) override; + Status Tune(const TuningParameters& params) override; + Status Compile(const CreationContext& creation_context) override; + + SpaceToDepth(SpaceToDepth&& operation); + SpaceToDepth& operator=(SpaceToDepth&& operation); + SpaceToDepth(const SpaceToDepth&) = delete; + SpaceToDepth& operator=(const SpaceToDepth&) = delete; + + private: + Status BindArguments(); + int3 GetGridSize() const; + + SpaceToDepthAttributes attr_; + CLKernel kernel_; + int3 work_group_size_; +}; + +SpaceToDepth CreateSpaceToDepth(const OperationDef& op_def, + const SpaceToDepthAttributes& attr); + +} // namespace cl +} // namespace gpu +} // namespace tflite + +#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_SPACE_TO_DEPTH_H_ diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/space_to_depth_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/space_to_depth_test.cc new file mode 100644 index 00000000000..02d93582ede --- /dev/null +++ b/tensorflow/lite/delegates/gpu/cl/kernels/space_to_depth_test.cc @@ -0,0 +1,144 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/delegates/gpu/cl/kernels/space_to_depth.h" + +#include +#include +#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h" +#include "tensorflow/lite/delegates/gpu/common/operations.h" +#include "tensorflow/lite/delegates/gpu/common/status.h" + +using ::testing::FloatNear; +using ::testing::Pointwise; + +namespace tflite { +namespace gpu { +namespace cl { +namespace { + +/* +// A known Qualcomm Adreno bug makes the 1 channel test fail on old devices. 
+TEST_F(OpenCLOperationTest, SpaceToDepthTensorShape1x2x2x1BlockSize2) { + TensorFloat32 src_tensor; + src_tensor.shape = BHWC(1, 2, 2, 1); + src_tensor.data = {half(1.0f), half(2.0f), half(3.0f), half(4.0f)}; + const SpaceToDepthAttributes attr = {.block_size = 2}; + for (auto storage : env_.GetSupportedStorages()) { + for (auto precision : env_.GetSupportedPrecisions()) { + OperationDef op_def; + op_def.precision = precision; + auto data_type = DeduceDataTypeFromPrecision(precision); + op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); + op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); + TensorFloat32 dst_tensor; + SpaceToDepth operation = CreateSpaceToDepth(op_def, attr); + ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, + BHWC(1, 1, 1, 4), &dst_tensor)); + EXPECT_THAT(dst_tensor.data, + Pointwise(FloatNear(1e-6), + {half(1.0f), half(2.0f), half(3.0f), half(4.0f)})); + } + } +} +*/ + +TEST_F(OpenCLOperationTest, SpaceToDepthTensorShape1x2x2x2BlockSize2) { + TensorFloat32 src_tensor; + src_tensor.shape = BHWC(1, 2, 2, 2); + src_tensor.data = {half(1.4f), half(2.3f), half(3.2f), half(4.1f), + half(5.4f), half(6.3f), half(7.2f), half(8.1f)}; + const SpaceToDepthAttributes attr = {.block_size = 2}; + for (auto storage : env_.GetSupportedStorages()) { + for (auto precision : env_.GetSupportedPrecisions()) { + OperationDef op_def; + op_def.precision = precision; + auto data_type = DeduceDataTypeFromPrecision(precision); + op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); + op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); + TensorFloat32 dst_tensor; + SpaceToDepth operation = CreateSpaceToDepth(op_def, attr); + ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, + BHWC(1, 1, 1, 8), &dst_tensor)); + EXPECT_THAT(dst_tensor.data, + Pointwise(FloatNear(1e-6), + {half(1.4f), half(2.3f), half(3.2f), half(4.1f), + half(5.4f), half(6.3f), half(7.2f), half(8.1f)})); + } + } +} + +TEST_F(OpenCLOperationTest, SpaceToDepthTensorShape1x2x2x3BlockSize2) { + TensorFloat32 src_tensor; + src_tensor.shape = BHWC(1, 2, 2, 3); + src_tensor.data = {half(1.0f), half(2.0f), half(3.0f), half(4.0f), + half(5.0f), half(6.0f), half(7.0f), half(8.0f), + half(9.0f), half(10.0f), half(11.0f), half(12.0f)}; + const SpaceToDepthAttributes attr = {.block_size = 2}; + for (auto storage : env_.GetSupportedStorages()) { + for (auto precision : env_.GetSupportedPrecisions()) { + OperationDef op_def; + op_def.precision = precision; + auto data_type = DeduceDataTypeFromPrecision(precision); + op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); + op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); + TensorFloat32 dst_tensor; + SpaceToDepth operation = CreateSpaceToDepth(op_def, attr); + ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, + BHWC(1, 1, 1, 12), &dst_tensor)); + EXPECT_THAT( + dst_tensor.data, + Pointwise(FloatNear(1e-6), {half(1.0f), half(2.0f), half(3.0f), // + half(4.0f), half(5.0f), half(6.0f), // + half(7.0f), half(8.0f), half(9.0f), // + half(10.0f), half(11.0f), half(12.0f)})); + } + } +} + +TEST_F(OpenCLOperationTest, SpaceToDepthTensorShape1x4x4x1BlockSize2) { + TensorFloat32 src_tensor; + src_tensor.shape = BHWC(1, 4, 4, 1); + src_tensor.data = {half(1.0f), half(2.0f), half(5.0f), half(6.0f), + half(3.0f), half(4.0f), half(7.0f), half(8.0f), + half(9.0f), half(10.0f), half(13.0f), half(14.0f), + half(11.0f), half(12.0f), half(15.0f), half(16.0f)}; + const 
+  for (auto storage : env_.GetSupportedStorages()) {
+    for (auto precision : env_.GetSupportedPrecisions()) {
+      OperationDef op_def;
+      op_def.precision = precision;
+      auto data_type = DeduceDataTypeFromPrecision(precision);
+      op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
+      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
+      TensorFloat32 dst_tensor;
+      SpaceToDepth operation = CreateSpaceToDepth(op_def, attr);
+      ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
+                                    BHWC(1, 2, 2, 4), &dst_tensor));
+      EXPECT_THAT(
+          dst_tensor.data,
+          Pointwise(FloatNear(1e-6),
+                    {half(1.0f), half(2.0f), half(3.0f), half(4.0f),      //
+                     half(5.0f), half(6.0f), half(7.0f), half(8.0f),      //
+                     half(9.0f), half(10.0f), half(11.0f), half(12.0f),   //
+                     half(13.0f), half(14.0f), half(15.0f), half(16.0f)}));
+    }
+  }
+}
+
+}  // namespace
+}  // namespace cl
+}  // namespace gpu
+}  // namespace tflite
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/util.cc b/tensorflow/lite/delegates/gpu/cl/kernels/util.cc
index 0943816f2d7..8cb8d615787 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/util.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/util.cc
@@ -711,6 +711,16 @@ void RearrangeWeightsToWinograd4x4To6x6Weights(
   }
 }
 
+int3 GetFirstSuitableWorkGroup(const std::vector<int3>& wgs, int max_wg_size) {
+  for (const auto& wg : wgs) {
+    const int wg_size = wg.x * wg.y * wg.z;
+    if (wg_size <= max_wg_size) {
+      return wg;
+    }
+  }
+  return {1, 1, 1};
+}
+
 }  // namespace cl
 }  // namespace gpu
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/util.h b/tensorflow/lite/delegates/gpu/cl/kernels/util.h
index 02d5df6c442..f5f3d532896 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/util.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/util.h
@@ -301,6 +301,10 @@ TextureAddressMode GetFastestZeroMode(const CLDevice& device);
 // but 8s-channel will be empty, then last plane (batch of 4 channels) will
 // have this mask (1, 1, 1, 0).
 float4 GetMaskForLastPlane(int channels);
+
+// Returns the first work group in wgs whose total size does not exceed
+// max_wg_size; if none is suitable, returns {1, 1, 1}.
+int3 GetFirstSuitableWorkGroup(const std::vector<int3>& wgs, int max_wg_size);
 }  // namespace cl
 }  // namespace gpu
 }  // namespace tflite
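For reference, the helper added above is a plain first-fit scan over an ordered candidate list. A minimal standalone sketch of the same logic (the int3 struct here is a stand-in so the snippet compiles outside the TFLite tree; the real type lives in delegates/gpu/common/types.h):

#include <iostream>
#include <vector>

struct int3 { int x, y, z; };  // stand-in for tflite::gpu::int3

int3 GetFirstSuitableWorkGroup(const std::vector<int3>& wgs, int max_wg_size) {
  for (const auto& wg : wgs) {
    if (wg.x * wg.y * wg.z <= max_wg_size) return wg;  // first fit wins
  }
  return {1, 1, 1};  // fallback: a single-thread group is always valid
}

int main() {
  // Candidates are ordered from most to least parallel, as in winograd.cc.
  const std::vector<int3> wgs = {{8, 6, 4}, {8, 6, 2}, {4, 6, 2}, {2, 6, 1}};
  // max_wg_size would come from kernel_.GetMaxWorkGroupSize() on a real device.
  const int3 wg = GetFirstSuitableWorkGroup(wgs, /*max_wg_size=*/64);
  std::cout << wg.x << "x" << wg.y << "x" << wg.z << "\n";  // 4x6x2 (48 <= 64)
}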
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc
index 868cca55882..9bb89874c3d 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include
 
 #include "absl/strings/str_format.h"
+#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
 #include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
 #include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
 #include "tensorflow/lite/delegates/gpu/cl/precision.h"
@@ -385,12 +386,18 @@ Status Winograd4x4To36::Compile(const CreationContext& creation_context) {
   if (creation_context.device->IsAdreno()) {
     options.push_back(CompilerOptions::ADRENO_MORE_WAVES);
   }
+  if (definition_.precision == CalculationsPrecision::F16 &&
+      creation_context.device->IsPowerVR()) {
+    options.push_back(CompilerOptions::POWERVR_FP16);
+  }
   RETURN_IF_ERROR(UploadBt(creation_context.context));
   const auto code =
       GetWinograd4x4To36Code(definition_, bt_, linked_operations_);
-  return creation_context.cache->GetOrCreateCLKernel(
+  RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
       code, "main_function", options, *creation_context.context,
-      *creation_context.device, &kernel_);
+      *creation_context.device, &kernel_));
+  work_group_size_ = SelectBestWorkGroup();
+  return OkStatus();
 }
 
 Status Winograd4x4To36::UploadBt(CLContext* context) {
@@ -413,6 +420,13 @@ Status Winograd4x4To36::UploadBt(CLContext* context) {
   return CreateLinearStorage(create_info, bt_aligned, context, &bt_);
 }
 
+int3 Winograd4x4To36::SelectBestWorkGroup() {
+  const std::vector<int3> wgs = {{8, 6, 4}, {8, 6, 2}, {4, 6, 2},
+                                 {4, 6, 2}, {2, 6, 2}, {2, 6, 1},
+                                 {1, 6, 1}, {1, 3, 1}, {1, 1, 1}};
+  return GetFirstSuitableWorkGroup(wgs, kernel_.GetMaxWorkGroupSize());
+}
+
 Status Winograd4x4To36::BindArguments() {
   kernel_.ResetBindingCounter();
   RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
@@ -442,8 +456,16 @@ int3 Winograd4x4To36::GetGridSize() const {
 }
 
 Status Winograd4x4To36::Tune(const TuningParameters& params) {
-  RETURN_IF_ERROR(BindArguments());
-  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
+  switch (params.tuning_type) {
+    case TuningType::EXHAUSTIVE:
+      RETURN_IF_ERROR(BindArguments());
+      return GetBestWorkGroup(params, kernel_, GetGridSize(),
+                              &work_group_size_);
+    case TuningType::FAST:
+    default:
+      work_group_size_ = SelectBestWorkGroup();
+      return OkStatus();
+  }
 }
 
 Status Winograd4x4To36::AddToQueue(CLCommandQueue* queue) {
@@ -478,11 +500,18 @@ Winograd36To4x4& Winograd36To4x4::operator=(Winograd36To4x4&& operation) {
 }
 
 Status Winograd36To4x4::Compile(const CreationContext& creation_context) {
+  std::vector<CompilerOptions> options;
+  if (definition_.precision == CalculationsPrecision::F16 &&
+      creation_context.device->IsPowerVR()) {
+    options.push_back(CompilerOptions::POWERVR_FP16);
+  }
   const auto code =
       GetWinograd36To4x4Code(definition_, at_, biases_, linked_operations_);
-  return creation_context.cache->GetOrCreateCLKernel(
-      code, "main_function", *creation_context.context,
-      *creation_context.device, &kernel_);
+  RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
+      code, "main_function", options, *creation_context.context,
+      *creation_context.device, &kernel_));
+  work_group_size_ = SelectBestWorkGroup();
+  return OkStatus();
 }
 
 Status Winograd36To4x4::UploadAt(CLContext* context) {
@@ -505,6 +534,13 @@ Status Winograd36To4x4::UploadAt(CLContext* context) {
   return CreateLinearStorage(create_info, at_aligned, context, &at_);
 }
 
+int3 Winograd36To4x4::SelectBestWorkGroup() {
+  const std::vector<int3> wgs = {{32, 4, 2}, {16, 4, 2}, {16, 4, 1},
+                                 {8, 4, 1},  {4, 4, 1},  {2, 4, 1},
+                                 {1, 4, 1},  {1, 2, 1},  {1, 1, 1}};
+  return GetFirstSuitableWorkGroup(wgs, kernel_.GetMaxWorkGroupSize());
+}
+
 Status Winograd36To4x4::BindArguments() {
   kernel_.ResetBindingCounter();
   RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr()));
@@ -530,8 +566,16 @@ int3 Winograd36To4x4::GetGridSize() const {
 }
 
 Status Winograd36To4x4::Tune(const TuningParameters& params) {
-  RETURN_IF_ERROR(BindArguments());
-  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
+  switch (params.tuning_type) {
+    case TuningType::EXHAUSTIVE:
+      RETURN_IF_ERROR(BindArguments());
+      return GetBestWorkGroup(params, kernel_, GetGridSize(),
+                              &work_group_size_);
+    case TuningType::FAST:
+    default:
+      work_group_size_ = SelectBestWorkGroup();
+      return OkStatus();
+  }
 }
 
 Status Winograd36To4x4::AddToQueue(CLCommandQueue* queue) {
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.h b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.h
index baa758ac6d8..f6b80b67f32 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.h
@@ -54,6 +54,9 @@ class Winograd4x4To36 : public GPUOperation {
 
   Status UploadBt(CLContext* context);
 
+  // Must be called after kernel compilation.
+  int3 SelectBestWorkGroup();
+
   Status BindArguments();
   int3 GetGridSize() const;
 
@@ -91,6 +94,9 @@ class Winograd36To4x4 : public GPUOperation {
 
   Status UploadAt(CLContext* context);
 
+  // Must be called after kernel compilation.
+  int3 SelectBestWorkGroup();
+
   Status BindArguments();
   int3 GetGridSize() const;
 
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/winograd_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/winograd_test.cc
index 3f0a6ceff74..7233fa87238 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/winograd_test.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/winograd_test.cc
@@ -77,7 +77,12 @@ TEST_F(OpenCLOperationTest, Winograd4x4To36) {
 
   for (auto storage : env_.GetSupportedStorages()) {
     for (auto precision : env_.GetSupportedPrecisions()) {
-      const float eps = precision == CalculationsPrecision::F32 ? 1e-5f : 1e-2f;
+      float eps;
+      if (precision == CalculationsPrecision::F32) {
+        eps = 1e-5f * (env_.device().SupportsFP32RTN() ? 1.0f : 4.0f);
+      } else {
+        eps = 1e-2f * (env_.device().SupportsFP16RTN() ? 1.0f : 4.0f);
+      }
       OperationDef op_def;
       op_def.precision = precision;
       auto data_type = DeduceDataTypeFromPrecision(precision);
@@ -144,7 +149,12 @@ TEST_F(OpenCLOperationTest, Winograd36To4x4) {
 
   for (auto storage : env_.GetSupportedStorages()) {
     for (auto precision : env_.GetSupportedPrecisions()) {
-      const float eps = precision == CalculationsPrecision::F32 ? 1e-5f : 1e-2f;
+      float eps;
+      if (precision == CalculationsPrecision::F32) {
+        eps = 1e-5f * (env_.device().SupportsFP32RTN() ? 1.0f : 4.0f);
+      } else {
+        eps = 1e-2f * (env_.device().SupportsFP16RTN() ? 1.0f : 4.0f);
+      }
       OperationDef op_def;
       op_def.precision = precision;
       auto data_type = DeduceDataTypeFromPrecision(precision);
diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.cc b/tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.cc
index a09236f77fc..7a2e54840b9 100644
--- a/tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.cc
@@ -248,13 +248,8 @@ Status GetBestWorkGroup(const TuningParameters& params, const CLKernel& kernel,
                         const int3& grid, int3* best_work_group) {
   switch (params.tuning_type) {
     case TuningType::FAST:
-      if (params.info->vendor != Vendor::QUALCOMM) {
-        *best_work_group = int3(8, 4, 1);
-        return OkStatus();
-      } else {
-        *best_work_group = GetWorkGroup(grid, kernel.GetMaxWorkGroupSize());
-        return OkStatus();
-      }
+      *best_work_group = GetWorkGroup(grid, kernel.GetMaxWorkGroupSize());
+      return OkStatus();
     case TuningType::EXHAUSTIVE:
       return GetBestWorkGroupAlignedToGrid(params, kernel, grid,
                                            best_work_group);
@@ -268,16 +263,16 @@ Status GetBestWorkGroupConv(const TuningParameters& params,
                             const CLKernel& kernel, const int3& grid,
                             int3* best_work_group) {
   switch (params.tuning_type) {
-    case TuningType::FAST:
-      if (params.info->vendor != Vendor::QUALCOMM) {
-        *best_work_group = int3(8, 4, 1);
-        return OkStatus();
-      } else {
-        int max_z_size = params.info->adreno_info.gpu_version < 400 ? 16 : 64;
-        *best_work_group =
-            GetWorkGroupConv(grid, kernel.GetMaxWorkGroupSize(), max_z_size);
-        return OkStatus();
+    case TuningType::FAST: {
+      int max_z_size = 16;
+      if (params.info->vendor == Vendor::QUALCOMM) {
+        max_z_size = params.info->adreno_info.gpu_version < 400 ? 16 : 64;
       }
+      max_z_size = std::min(max_z_size, params.info->max_work_group_sizes.z);
+      *best_work_group =
+          GetWorkGroupConv(grid, kernel.GetMaxWorkGroupSize(), max_z_size);
+      return OkStatus();
+    }
     case TuningType::EXHAUSTIVE:
       return GetBestWorkGroupAlignedToGrid(params, kernel, grid,
                                            best_work_group);
diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/BUILD b/tensorflow/lite/delegates/gpu/cl/selectors/BUILD
index 293a34df4a5..908c1b91583 100644
--- a/tensorflow/lite/delegates/gpu/cl/selectors/BUILD
+++ b/tensorflow/lite/delegates/gpu/cl/selectors/BUILD
@@ -10,7 +10,6 @@ cc_library(
     deps = [
         "//tensorflow/lite/delegates/gpu/cl:model_hints",
         "//tensorflow/lite/delegates/gpu/cl:tensor_type",
-        "//tensorflow/lite/delegates/gpu/cl/kernels:conv_buffer",
        "//tensorflow/lite/delegates/gpu/cl/kernels:conv_buffer_1x1",
         "//tensorflow/lite/delegates/gpu/cl/kernels:conv_constants",
         "//tensorflow/lite/delegates/gpu/cl/kernels:conv_powervr",
@@ -84,7 +83,9 @@ cc_library(
         ":dw_convolution_selector",
         ":fully_connected_selector",
         ":simple_selectors",
+        "//tensorflow/lite/delegates/gpu/cl:cl_device",
         "//tensorflow/lite/delegates/gpu/cl:model_hints",
+        "//tensorflow/lite/delegates/gpu/cl:storage_type_util",
         "//tensorflow/lite/delegates/gpu/cl:tensor_type",
         "//tensorflow/lite/delegates/gpu/cl/kernels:elementwise",
         "//tensorflow/lite/delegates/gpu/cl/kernels:gpu_operation",
@@ -120,8 +121,10 @@ cc_library(
         "//tensorflow/lite/delegates/gpu/cl/kernels:resize",
         "//tensorflow/lite/delegates/gpu/cl/kernels:softmax",
         "//tensorflow/lite/delegates/gpu/cl/kernels:softmax1x1",
+        "//tensorflow/lite/delegates/gpu/cl/kernels:space_to_depth",
         "//tensorflow/lite/delegates/gpu/cl/kernels:strided_slice",
         "//tensorflow/lite/delegates/gpu/cl/kernels:transpose",
+        "//tensorflow/lite/delegates/gpu/cl/kernels:winograd",
"//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:shape", "//tensorflow/lite/delegates/gpu/common:status", diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/convolution_selector.cc b/tensorflow/lite/delegates/gpu/cl/selectors/convolution_selector.cc index a70aa3f9901..0103ca08b90 100644 --- a/tensorflow/lite/delegates/gpu/cl/selectors/convolution_selector.cc +++ b/tensorflow/lite/delegates/gpu/cl/selectors/convolution_selector.cc @@ -16,7 +16,6 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/cl/selectors/convolution_selector.h" #include "absl/memory/memory.h" -#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_1x1.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h" @@ -48,6 +47,20 @@ Status SelectConvolutionAdreno(const Convolution2DAttributes& attr, return OkStatus(); } +Status SelectConvolutionWinogradAdreno(const Convolution2DAttributes& attr, + const BHWC& dst_shape, + const CreationContext& creation_context, + const OperationDef& op_def, + ModelHints hints, + std::unique_ptr* ptr) { + ConvTexture conv; + RETURN_IF_ERROR( + CreateConvTextureWino4x4To6x6(creation_context, op_def, attr, &conv)); + *ptr = absl::make_unique(std::move(conv)); + + return OkStatus(); +} + Status SelectConvolutionNVidia(const Convolution2DAttributes& attr, const CreationContext& creation_context, const OperationDef& op_def, @@ -83,17 +96,32 @@ Status SelectConvolutionMali(const Convolution2DAttributes& attr, ConvBuffer1x1 conv; RETURN_IF_ERROR(CreateConvBuffer1x1(creation_context, op_def, attr, &conv)); *ptr = absl::make_unique(std::move(conv)); - } else if (op_def.src_tensors[0].storage_type == TensorStorageType::BUFFER) { - ConvBuffer conv; - RETURN_IF_ERROR(CreateConvBuffer(creation_context, op_def, attr, &conv)); - *ptr = absl::make_unique(std::move(conv)); } else { - ConvTexture conv; - RETURN_IF_ERROR(CreateConvTexture(creation_context, op_def, attr, &conv)); - *ptr = absl::make_unique(std::move(conv)); + ConvPowerVR conv; + RETURN_IF_ERROR(CreateConvPowerVR(creation_context, op_def, attr, &conv)); + *ptr = absl::make_unique(std::move(conv)); } return OkStatus(); } + +Status SelectConvolutionWinogradMali(const Convolution2DAttributes& attr, + const CreationContext& creation_context, + const OperationDef& op_def, + std::unique_ptr* ptr) { + if (op_def.src_tensors[0].storage_type == TensorStorageType::BUFFER) { + ConvBuffer1x1 conv; + RETURN_IF_ERROR( + CreateConvBuffer1x1Wino4x4To6x6(creation_context, op_def, attr, &conv)); + *ptr = absl::make_unique(std::move(conv)); + } else { + ConvPowerVR conv; + RETURN_IF_ERROR( + CreateConvPowerVRWino4x4To6x6(creation_context, op_def, attr, &conv)); + *ptr = absl::make_unique(std::move(conv)); + } + + return OkStatus(); +} } // namespace Status SelectConvolution(const Convolution2DAttributes& attr, @@ -118,6 +146,33 @@ Status SelectConvolution(const Convolution2DAttributes& attr, } } +Status SelectConvolutionForWinograd(const Convolution2DAttributes& attr, + const BHWC& dst_shape, + const CreationContext& creation_context, + const OperationDef& op_def, + ModelHints hints, + std::unique_ptr* ptr) { + switch (creation_context.device->vendor()) { + case Vendor::QUALCOMM: + return SelectConvolutionWinogradAdreno(attr, dst_shape, creation_context, + op_def, hints, ptr); + case Vendor::POWERVR: + case Vendor::AMD: + case Vendor::NVIDIA: 
{
+      ConvPowerVR conv;
+      RETURN_IF_ERROR(
+          CreateConvPowerVRWino4x4To6x6(creation_context, op_def, attr, &conv));
+      *ptr = absl::make_unique<ConvPowerVR>(std::move(conv));
+      return OkStatus();
+    }
+    case Vendor::MALI:
+      return SelectConvolutionWinogradMali(attr, creation_context, op_def, ptr);
+    default:
+      return SelectConvolutionWinogradAdreno(attr, dst_shape, creation_context,
+                                             op_def, hints, ptr);
+  }
+}
+
 }  // namespace cl
 }  // namespace gpu
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/convolution_selector.h b/tensorflow/lite/delegates/gpu/cl/selectors/convolution_selector.h
index 7dd6c79eea0..dc0657ec47c 100644
--- a/tensorflow/lite/delegates/gpu/cl/selectors/convolution_selector.h
+++ b/tensorflow/lite/delegates/gpu/cl/selectors/convolution_selector.h
@@ -34,6 +34,13 @@ Status SelectConvolution(const Convolution2DAttributes& attr,
                          const OperationDef& op_def, ModelHints hints,
                          std::unique_ptr<GPUOperation>* ptr);
 
+Status SelectConvolutionForWinograd(const Convolution2DAttributes& attr,
+                                    const BHWC& dst_shape,
+                                    const CreationContext& creation_context,
+                                    const OperationDef& op_def,
+                                    ModelHints hints,
+                                    std::unique_ptr<GPUOperation>* ptr);
+
 }  // namespace cl
 }  // namespace gpu
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc b/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc
index 3153d7ddfd8..29c246a2744 100644
--- a/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc
+++ b/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc
@@ -17,12 +17,14 @@ limitations under the License.
 
 #include "absl/strings/str_cat.h"
 #include "absl/types/any.h"
+#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
 #include "tensorflow/lite/delegates/gpu/cl/kernels/elementwise.h"
 #include "tensorflow/lite/delegates/gpu/cl/selectors/convolution_selector.h"
 #include "tensorflow/lite/delegates/gpu/cl/selectors/convolution_transposed_selector.h"
 #include "tensorflow/lite/delegates/gpu/cl/selectors/dw_convolution_selector.h"
 #include "tensorflow/lite/delegates/gpu/cl/selectors/fully_connected_selector.h"
 #include "tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.h"
+#include "tensorflow/lite/delegates/gpu/cl/storage_type_util.h"
 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
 #include "tensorflow/lite/delegates/gpu/common/operations.h"
 #include "tensorflow/lite/delegates/gpu/common/shape.h"
@@ -51,6 +53,111 @@ bool IsChannelsBroadcastedForSecondInput(
          inputs[0]->tensor.shape.c != inputs[1]->tensor.shape.c &&
          inputs[1]->tensor.shape.c == 1;
 }
+
+bool IsSuitableForWinograd4x4To6x6(const Convolution2DAttributes& attr,
+                                   const CLDevice& device,
+                                   const BHWC& dst_shape) {
+  const int tiles_x = IntegralDivideRoundUp(dst_shape.w, 4);
+  const int tiles_y = IntegralDivideRoundUp(dst_shape.h, 4);
+  const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
+  const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
+  const bool suitable_attributes =
+      attr.weights.shape.w == 3 && attr.weights.shape.h == 3 &&
+      attr.dilations == HW(1, 1) && attr.strides == HW(1, 1);
+  const int min_depth = 32;
+  const bool recommended_channels =
+      dst_depth % 4 == 0 && src_depth >= min_depth && dst_depth >= min_depth;
+  const bool recommended_hw = tiles_x * tiles_y >= 128;
+  return suitable_attributes && recommended_channels && recommended_hw;
+}
+
+Status WinogradFromNode(const CreationContext& creation_context,
+                        const OperationDef& op_def, ModelHints hints,
+                        const BHWC& input_shape, const BHWC&
output_shape,
+                        const Convolution2DAttributes& attr,
+                        GPUOperationsSubgraph* gpu_subgraph) {
+  if (!IsSuitableForWinograd4x4To6x6(attr, *creation_context.device,
+                                     output_shape)) {
+    return UnimplementedError("No implementation for this case.");
+  }
+
+  const int tiles_x = IntegralDivideRoundUp(output_shape.w, 4);
+  const int tiles_y = IntegralDivideRoundUp(output_shape.h, 4);
+  const BHWC shape_0{input_shape.b, 36, tiles_x * tiles_y, input_shape.c};
+  const BHWC shape_1{input_shape.b, 36, tiles_x * tiles_y, output_shape.c};
+  TensorDescriptor td_0;
+  td_0.storage_type = SelectBestStorageType(
+      *creation_context.context, *creation_context.device, shape_0,
+      op_def.src_tensors[0].storage_type, op_def.src_tensors[0].data_type,
+      op_def.src_tensors[0].layout);
+  td_0.data_type = op_def.src_tensors[0].data_type;
+  td_0.layout = op_def.src_tensors[0].layout;
+  TensorDescriptor td_1;
+  td_1.storage_type = SelectBestStorageType(
+      *creation_context.context, *creation_context.device, shape_1,
+      op_def.src_tensors[0].storage_type, op_def.src_tensors[0].data_type,
+      op_def.src_tensors[0].layout);
+  td_1.data_type = op_def.src_tensors[0].data_type;
+  td_1.layout = op_def.src_tensors[0].layout;
+  gpu_subgraph->new_tensors = {{shape_0, td_0}, {shape_1, td_1}};
+  gpu_subgraph->operations.clear();
+  gpu_subgraph->operations.resize(3);
+
+  OperationDef winograd_up_def;
+  winograd_up_def.precision = op_def.precision;
+  winograd_up_def.src_tensors.push_back(op_def.src_tensors[0]);
+  winograd_up_def.dst_tensors.push_back(td_0);
+  auto& winograd_up = gpu_subgraph->operations[0];
+  RETURN_IF_ERROR(SelectWinograd4x4To36(
+      creation_context, attr.padding, winograd_up_def, &winograd_up.operation));
+  winograd_up.input_ids = {0};
+  winograd_up.output_ids = {-1};
+
+  OperationDef conv_def;
+  conv_def.precision = op_def.precision;
+  conv_def.src_tensors.push_back(td_0);
+  conv_def.dst_tensors.push_back(td_1);
+  auto& conv = gpu_subgraph->operations[1];
+  conv.input_ids = {-1};
+  conv.output_ids = {-2};
+  RETURN_IF_ERROR(SelectConvolutionForWinograd(
+      attr, input_shape, creation_context, conv_def, hints, &conv.operation));
+
+  OperationDef winograd_down_def;
+  winograd_down_def.precision = op_def.precision;
+  winograd_down_def.src_tensors.push_back(td_1);
+  winograd_down_def.dst_tensors.push_back(op_def.dst_tensors[0]);
+  auto& winograd_down = gpu_subgraph->operations[2];
+  winograd_down.input_ids = {-2};
+  winograd_down.output_ids = {0};
+  auto bias_copy = attr.bias;
+  if (bias_copy.shape.v < attr.weights.shape.o) {
+    bias_copy.shape = Linear(attr.weights.shape.o);
+    bias_copy.data.resize(attr.weights.shape.o);
+  }
+  RETURN_IF_ERROR(SelectWinograd36To4x4(creation_context, winograd_down_def,
+                                        bias_copy, &winograd_down.operation));
+
+  return OkStatus();
+}
+
+std::unique_ptr<GPUOperation>* InitSingleOpSubgraph(
+    const std::vector<Value<TensorRef<BHWC>>*>& inputs,
+    const std::vector<Value<TensorRef<BHWC>>*>& outputs,
+    GPUOperationsSubgraph* gpu_subgraph) {
+  gpu_subgraph->operations.clear();
+  gpu_subgraph->new_tensors.clear();
+  gpu_subgraph->operations.push_back({});
+  for (int i = 0; i < inputs.size(); ++i) {
+    gpu_subgraph->operations[0].input_ids.push_back(i);
+  }
+  for (int i = 0; i < outputs.size(); ++i) {
+    gpu_subgraph->operations[0].output_ids.push_back(i);
+  }
+
+  return &gpu_subgraph->operations[0].operation;
+}
+
 }  // namespace
 
 Status GPUOperationFromNode(const CreationContext& creation_context,
@@ -59,15 +166,8 @@ Status GPUOperationFromNode(const CreationContext& creation_context,
                             const std::vector<Value<TensorRef<BHWC>>*>& outputs,
                             const Node& node,
                             GPUOperationsSubgraph*
gpu_subgraph) {
-  gpu_subgraph->operations.push_back({});
   std::unique_ptr<GPUOperation>* gpu_op =
-      &gpu_subgraph->operations[0].operation;
-  for (int i = 0; i < inputs.size(); ++i) {
-    gpu_subgraph->operations[0].input_ids.push_back(i);
-  }
-  for (int i = 0; i < outputs.size(); ++i) {
-    gpu_subgraph->operations[0].output_ids.push_back(i);
-  }
+      InitSingleOpSubgraph(inputs, outputs, gpu_subgraph);
   auto op_type = OperationTypeFromString(node.operation.type);
   switch (op_type) {
     case OperationType::ADD: {
@@ -111,9 +211,17 @@ Status GPUOperationFromNode(const CreationContext& creation_context,
     case OperationType::CONVOLUTION_2D: {
       auto attr =
          absl::any_cast<Convolution2DAttributes>(node.operation.attributes);
-      auto input = inputs[0];
-      return SelectConvolution(attr, input->tensor.shape, creation_context,
-                               op_def, hints, gpu_op);
+      auto input_shape = inputs[0]->tensor.shape;
+      auto output_shape = outputs[0]->tensor.shape;
+      if (WinogradFromNode(creation_context, op_def, hints, input_shape,
+                           output_shape, attr, gpu_subgraph)
+              .ok()) {
+        return OkStatus();
+      } else {
+        gpu_op = InitSingleOpSubgraph(inputs, outputs, gpu_subgraph);
+        return SelectConvolution(attr, input_shape, creation_context, op_def,
+                                 hints, gpu_op);
+      }
     }
     case OperationType::CONVOLUTION_TRANSPOSED: {
       auto attr = absl::any_cast<ConvolutionTransposedAttributes>(
@@ -196,6 +304,10 @@ Status GPUOperationFromNode(const CreationContext& creation_context,
       SelectReshape(src_channels, attr.new_shape.c, op_def, gpu_op);
       return OkStatus();
     }
+    case OperationType::RESIZE: {
+      auto attr = absl::any_cast<Resize2DAttributes>(node.operation.attributes);
+      return SelectResize(attr, op_def, gpu_op);
+    }
     case OperationType::SLICE: {
       auto attr = absl::any_cast<SliceAttributes>(node.operation.attributes);
       SelectStridedSlice(attr, op_def, gpu_op);
@@ -205,18 +317,21 @@ Status GPUOperationFromNode(const CreationContext& creation_context,
       SelectSoftmax(inputs[0]->tensor.shape, op_def, gpu_op);
       return OkStatus();
     }
+    case OperationType::SPACE_TO_DEPTH: {
+      auto attr =
+          absl::any_cast<SpaceToDepthAttributes>(node.operation.attributes);
+      SelectSpaceToDepth(attr, op_def, gpu_op);
+      return OkStatus();
+    }
     case OperationType::TRANSPOSE: {
       auto attr = absl::any_cast<TransposeAttributes>(node.operation.attributes);
       SelectTranspose(attr, op_def, gpu_op);
       return OkStatus();
     }
-    case OperationType::RESIZE: {
-      auto attr = absl::any_cast<Resize2DAttributes>(node.operation.attributes);
-      return SelectResize(attr, op_def, gpu_op);
-    }
     case OperationType::ABS:
     case OperationType::COS:
+    case OperationType::EXP:
     case OperationType::HARD_SWISH:
     case OperationType::LOG:
     case OperationType::RSQRT:
@@ -240,8 +355,8 @@ Status GPUOperationFromNode(const CreationContext& creation_context,
       broadcast.width = IsWidthBroadcastedForSecondInput(inputs);
       broadcast.height = IsHeightBroadcastedForSecondInput(inputs);
       broadcast.channels = IsChannelsBroadcastedForSecondInput(inputs);
-      const auto attr =
-          absl::any_cast<ElementwiseAttributes>(node.operation.attributes);
+      const ElementwiseAttributes* attr =
+          absl::any_cast<ElementwiseAttributes>(&node.operation.attributes);
       ElementwiseTwoInput operation = CreateElementwiseTwoInput(
           creation_context, op_def, op_type, broadcast, attr);
       *gpu_op = absl::make_unique<ElementwiseTwoInput>(std::move(operation));
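A worked example of the gating arithmetic in IsSuitableForWinograd4x4To6x6 above, as a standalone sketch (the 1x80x80x128 output shape and 128 input channels are hypothetical values chosen to pass all three checks; the attribute check — 3x3 weights, stride 1, dilation 1 — is assumed to hold):

#include <iostream>

int DivideRoundUp(int n, int d) { return (n + d - 1) / d; }  // as IntegralDivideRoundUp

int main() {
  const int dst_w = 80, dst_h = 80, src_channels = 128, dst_channels = 128;
  const int tiles_x = DivideRoundUp(dst_w, 4);           // 20 output tiles across
  const int tiles_y = DivideRoundUp(dst_h, 4);           // 20 output tiles down
  const int src_depth = DivideRoundUp(src_channels, 4);  // 32 slices of 4 channels
  const int dst_depth = DivideRoundUp(dst_channels, 4);  // 32 slices of 4 channels
  const int min_depth = 32;
  const bool recommended_channels =
      dst_depth % 4 == 0 && src_depth >= min_depth && dst_depth >= min_depth;
  const bool recommended_hw = tiles_x * tiles_y >= 128;  // 400 >= 128
  std::cout << (recommended_channels && recommended_hw) << "\n";  // 1: take Winograd path
}

When either condition fails, WinogradFromNode returns an error and the caller falls back to the plain SelectConvolution path, re-initializing the single-op subgraph first.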
#include "tensorflow/lite/delegates/gpu/cl/kernels/resize.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/softmax.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/softmax1x1.h" +#include "tensorflow/lite/delegates/gpu/cl/kernels/space_to_depth.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/strided_slice.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/transpose.h" +#include "tensorflow/lite/delegates/gpu/cl/kernels/winograd.h" #include "tensorflow/lite/delegates/gpu/common/status.h" namespace tflite { @@ -125,6 +127,13 @@ void SelectReshape(int src_channels, int dst_channels, } } +void SelectSpaceToDepth(const SpaceToDepthAttributes& attr, + const OperationDef& op_def, + std::unique_ptr* ptr) { + SpaceToDepth operation = CreateSpaceToDepth(op_def, attr); + *ptr = absl::make_unique(std::move(operation)); +} + void SelectPadding(const PadAttributes& attr, const OperationDef& op_def, std::unique_ptr* ptr) { Padding operation = CreatePadding(op_def, attr); @@ -187,6 +196,28 @@ void SelectTranspose(const TransposeAttributes& attr, *ptr = absl::make_unique(std::move(operation)); } +Status SelectWinograd4x4To36(const CreationContext& creation_context, + const Padding2D& padding, + const OperationDef& op_def, + std::unique_ptr* ptr) { + Winograd4x4To36 operation; + RETURN_IF_ERROR( + CreateWinograd4x4To36(creation_context, op_def, padding, &operation)); + *ptr = absl::make_unique(std::move(operation)); + return OkStatus(); +} + +Status SelectWinograd36To4x4( + const CreationContext& creation_context, const OperationDef& op_def, + const ::tflite::gpu::Tensor& biases, + std::unique_ptr* ptr) { + Winograd36To4x4 operation; + RETURN_IF_ERROR( + CreateWinograd36To4x4(creation_context, op_def, biases, &operation)); + *ptr = absl::make_unique(std::move(operation)); + return OkStatus(); +} + } // namespace cl } // namespace gpu } // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.h b/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.h index a9cc7c2fe7b..fd29ebc0e91 100644 --- a/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.h +++ b/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.h @@ -82,10 +82,24 @@ Status SelectBroadcastAdd(const AddAttributes& attr, void SelectSoftmax(const BHWC& shape, const OperationDef& op_def, std::unique_ptr* ptr); +void SelectSpaceToDepth(const SpaceToDepthAttributes& attr, + const OperationDef& op_def, + std::unique_ptr* ptr); + void SelectTranspose(const TransposeAttributes& attr, const OperationDef& op_def, std::unique_ptr* ptr); +Status SelectWinograd4x4To36(const CreationContext& creation_context, + const Padding2D& padding, + const OperationDef& op_def, + std::unique_ptr* ptr); + +Status SelectWinograd36To4x4( + const CreationContext& creation_context, const OperationDef& op_def, + const ::tflite::gpu::Tensor& biases, + std::unique_ptr* ptr); + } // namespace cl } // namespace gpu } // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/storage_type_util.cc b/tensorflow/lite/delegates/gpu/cl/storage_type_util.cc new file mode 100644 index 00000000000..26eb3ad3538 --- /dev/null +++ b/tensorflow/lite/delegates/gpu/cl/storage_type_util.cc @@ -0,0 +1,141 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/delegates/gpu/cl/storage_type_util.h"
+
+#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
+#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
+#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
+#include "tensorflow/lite/delegates/gpu/common/data_type.h"
+#include "tensorflow/lite/delegates/gpu/common/shape.h"
+
+namespace tflite {
+namespace gpu {
+namespace cl {
+bool CanCreateTensorWithShape(const CLContext& context, const CLDevice& device,
+                              const BHWDC& shape,
+                              const TensorDescriptor& descriptor) {
+  const int slices = IntegralDivideRoundUp(shape.c, 4);
+  switch (descriptor.storage_type) {
+    case TensorStorageType::BUFFER: {
+      const int flt4_size =
+          4 * (descriptor.data_type == DataType::FLOAT32 ? 4 : 2);
+      const int buffer_size =
+          shape.b * shape.w * shape.h * shape.d * slices * flt4_size;
+      return buffer_size <= device.GetInfo().buffer_max_size;
+    }
+    case TensorStorageType::IMAGE_BUFFER:
+      return shape.b * shape.w * shape.h * shape.d * slices <=
+             device.GetInfo().image_buffer_max_size;
+    case TensorStorageType::TEXTURE_3D:
+      if (device.cl_version() < OpenCLVersion::CL_1_2 && slices == 1) {
+        // clCreateImage3D (that used in CL 1.0/1.1) can not create image with
+        // depth = 1 by specification;
+        return false;
+      }
+      return shape.w * shape.b <= device.GetInfo().image3d_max_width &&
+             shape.h <= device.GetInfo().image3d_max_height &&
+             slices * shape.d <= device.GetInfo().image3d_max_depth;
+    case TensorStorageType::TEXTURE_ARRAY:
+      // Bug on some Adreno. b/131099086
+      if (slices == 1 && !device.SupportsOneLayerTextureArray()) {
+        return false;
+      }
+      return shape.w * shape.b <= device.GetInfo().image2d_max_width &&
+             shape.h <= device.GetInfo().image2d_max_height &&
+             slices * shape.d <= device.GetInfo().image_array_max_layers;
+    case TensorStorageType::TEXTURE_2D:
+      return shape.w * shape.b * shape.d <=
+                 device.GetInfo().image2d_max_width &&
+             shape.h * slices <= device.GetInfo().image2d_max_height;
+    case TensorStorageType::SINGLE_TEXTURE_2D:
+      return shape.c <= 4 &&
+             context.IsFloatTexture2DSupported(shape.c, descriptor.data_type) &&
+             shape.w * shape.b * shape.d <=
+                 device.GetInfo().image2d_max_width &&
+             shape.h <= device.GetInfo().image2d_max_height;
+    default:
+      return false;
+  }
+}
+
+bool CanCreateTensorWithShape(const CLContext& context, const CLDevice& device,
+                              const BHWC& shape,
+                              const TensorDescriptor& descriptor) {
+  const BHWDC shape5D(shape.b, shape.h, shape.w, 1, shape.c);
+  return CanCreateTensorWithShape(context, device, shape5D, descriptor);
+}
+
+TensorStorageType SelectBestStorageType(const CLContext& context,
+                                        const CLDevice& device,
+                                        const BHWC& shape,
+                                        const TensorStorageType& desired,
+                                        const DataType& data_type,
+                                        const Layout& layout) {
+  if (CanCreateTensorWithShape(context, device, shape,
+                               TensorDescriptor{data_type, desired, layout})) {
+    return desired;
+  }
+  auto GetBestTypeAfterTextureArray = [&]() {
+    if (device.SupportsImageBuffer() &&
+        CanCreateTensorWithShape(
+            context, device, shape,
+            TensorDescriptor{data_type, TensorStorageType::IMAGE_BUFFER,
+                             layout})) {
+      return TensorStorageType::IMAGE_BUFFER;
+    } else {
+      return TensorStorageType::BUFFER;
+    }
+  };
+  auto GetBestTypeAfterTexture2D = [&]() {
+    if (device.SupportsTextureArray() &&
+        CanCreateTensorWithShape(
+            context, device, shape,
+            TensorDescriptor{data_type, TensorStorageType::TEXTURE_ARRAY,
+                             layout})) {
+      return TensorStorageType::TEXTURE_ARRAY;
+    } else {
+      return GetBestTypeAfterTextureArray();
+    }
+  };
+  auto GetBestTypeAfterTexture3D = [&]() {
+    if (CanCreateTensorWithShape(
+            context, device, shape,
+            TensorDescriptor{data_type, TensorStorageType::TEXTURE_2D,
+                             layout})) {
+      return TensorStorageType::TEXTURE_2D;
+    } else {
+      return GetBestTypeAfterTexture2D();
+    }
+  };
+  switch (desired) {
+    case TensorStorageType::TEXTURE_2D:
+    case TensorStorageType::SINGLE_TEXTURE_2D:
+      return GetBestTypeAfterTexture2D();
+    case TensorStorageType::TEXTURE_ARRAY:
+      return GetBestTypeAfterTextureArray();
+    case TensorStorageType::TEXTURE_3D:
+      return GetBestTypeAfterTexture3D();
+    case TensorStorageType::IMAGE_BUFFER:
+    case TensorStorageType::BUFFER:
+      return TensorStorageType::BUFFER;
+    default:
+      return TensorStorageType::BUFFER;
+  }
+}
+
+}  // namespace cl
+}  // namespace gpu
+}  // namespace tflite
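A sketch of the degradation order SelectBestStorageType encodes through its nested lambdas, flattened into one function for readability (illustrative only, with a stand-in enum; the real function also re-checks CanCreateTensorWithShape and device support at each step before settling on a type):

enum class Storage {
  BUFFER, IMAGE_BUFFER, TEXTURE_2D, TEXTURE_3D, TEXTURE_ARRAY, SINGLE_TEXTURE_2D
};

// Next candidate tried when `s` cannot hold the requested shape. Every chain
// terminates at BUFFER, which is assumed to always be creatable.
Storage NextFallback(Storage s) {
  switch (s) {
    case Storage::TEXTURE_3D:        return Storage::TEXTURE_2D;
    case Storage::TEXTURE_2D:
    case Storage::SINGLE_TEXTURE_2D: return Storage::TEXTURE_ARRAY;
    case Storage::TEXTURE_ARRAY:     return Storage::IMAGE_BUFFER;
    default:                         return Storage::BUFFER;
  }
}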
diff --git a/tensorflow/lite/delegates/gpu/cl/storage_type_util.h b/tensorflow/lite/delegates/gpu/cl/storage_type_util.h
new file mode 100644
index 00000000000..87fc2206e81
--- /dev/null
+++ b/tensorflow/lite/delegates/gpu/cl/storage_type_util.h
@@ -0,0 +1,48 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_STORAGE_TYPE_UTIL_H_
+#define TENSORFLOW_LITE_DELEGATES_GPU_CL_STORAGE_TYPE_UTIL_H_
+
+#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
+#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
+#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
+#include "tensorflow/lite/delegates/gpu/common/data_type.h"
+#include "tensorflow/lite/delegates/gpu/common/shape.h"
+
+namespace tflite {
+namespace gpu {
+namespace cl {
+
+bool CanCreateTensorWithShape(const CLContext& context, const CLDevice& device,
+                              const BHWDC& shape,
+                              const TensorDescriptor& descriptor);
+
+bool CanCreateTensorWithShape(const CLContext& context, const CLDevice& device,
+                              const BHWC& shape,
+                              const TensorDescriptor& descriptor);
+
+TensorStorageType SelectBestStorageType(const CLContext& context,
+                                        const CLDevice& device,
+                                        const BHWC& shape,
+                                        const TensorStorageType& desired,
+                                        const DataType& data_type,
+                                        const Layout& layout);
+
+}  // namespace cl
+}  // namespace gpu
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_DELEGATES_GPU_CL_STORAGE_TYPE_UTIL_H_
diff --git a/tensorflow/lite/delegates/gpu/cl/tensor.cc b/tensorflow/lite/delegates/gpu/cl/tensor.cc
index 8423613440e..610ba407eb9 100644
--- a/tensorflow/lite/delegates/gpu/cl/tensor.cc
+++ b/tensorflow/lite/delegates/gpu/cl/tensor.cc
@@ -331,60 +331,6 @@ Status Tensor::ReadData(CLCommandQueue* queue, Tensor5DFloat32* dst) const {
   return ReadDataBHWDC(absl::MakeSpan(dst->data), queue);
 }
 
-bool CanCreateTensorWithShape(const CLContext& context, const CLDevice& device,
-                              const BHWC& shape,
-                              const TensorDescriptor& descriptor) {
-  const BHWDC shape5D(shape.b, shape.h, shape.w, 1, shape.c);
-  return CanCreateTensorWithShape(context, device, shape5D, descriptor);
-}
-
-bool CanCreateTensorWithShape(const CLContext& context, const CLDevice& device,
-                              const BHWDC& shape,
-                              const TensorDescriptor& descriptor) {
-  const int slices = IntegralDivideRoundUp(shape.c, 4);
-  switch (descriptor.storage_type) {
-    case TensorStorageType::BUFFER: {
-      const int flt4_size =
-          4 * (descriptor.data_type == DataType::FLOAT32 ? 4 : 2);
-      const int buffer_size =
-          shape.b * shape.w * shape.h * shape.d * slices * flt4_size;
-      return buffer_size <= device.GetInfo().buffer_max_size;
-    }
-    case TensorStorageType::IMAGE_BUFFER:
-      return shape.b * shape.w * shape.h * shape.d * slices <=
-             device.GetInfo().image_buffer_max_size;
-    case TensorStorageType::TEXTURE_3D:
-      if (device.cl_version() < OpenCLVersion::CL_1_2 && slices == 1) {
-        // clCreateImage3D (that used in CL 1.0/1.1) can not create image with
-        // depth = 1 by specification;
-        return false;
-      }
-      return shape.w * shape.b <= device.GetInfo().image3d_max_width &&
-             shape.h <= device.GetInfo().image3d_max_height &&
-             slices * shape.d <= device.GetInfo().image3d_max_depth;
-    case TensorStorageType::TEXTURE_ARRAY:
-      // Bug on some Adreno. b/131099086
-      if (slices == 1 && !device.SupportsOneLayerTextureArray()) {
-        return false;
-      }
-      return shape.w * shape.b <= device.GetInfo().image2d_max_width &&
-             shape.h <= device.GetInfo().image2d_max_height &&
-             slices * shape.d <= device.GetInfo().image_array_max_layers;
-    case TensorStorageType::TEXTURE_2D:
-      return shape.w * shape.b * shape.d <=
-                 device.GetInfo().image2d_max_width &&
-             shape.h * slices <= device.GetInfo().image2d_max_height;
-    case TensorStorageType::SINGLE_TEXTURE_2D:
-      return shape.c <= 4 &&
-             context.IsFloatTexture2DSupported(shape.c, descriptor.data_type) &&
-             shape.w * shape.b * shape.d <=
-                 device.GetInfo().image2d_max_width &&
-             shape.h <= device.GetInfo().image2d_max_height;
-    default:
-      return false;
-  }
-}
-
 Status CreateTensor(const CLContext& context, const CLDevice& device,
                     const BHWC& shape, const TensorDescriptor& descriptor,
                     Tensor* result) {
diff --git a/tensorflow/lite/delegates/gpu/cl/tensor.h b/tensorflow/lite/delegates/gpu/cl/tensor.h
index efc09480a39..34a45436386 100644
--- a/tensorflow/lite/delegates/gpu/cl/tensor.h
+++ b/tensorflow/lite/delegates/gpu/cl/tensor.h
@@ -145,14 +145,6 @@ class Tensor {
 
 using TensorPtr = std::shared_ptr<Tensor>;
 
-bool CanCreateTensorWithShape(const CLContext& context, const CLDevice& device,
-                              const BHWC& shape,
-                              const TensorDescriptor& descriptor);
-
-bool CanCreateTensorWithShape(const CLContext& context, const CLDevice& device,
-                              const BHWDC& shape,
-                              const TensorDescriptor& descriptor);
-
 Status AllocateTensorMemory(const CLContext& context, const CLDevice& device,
                             const BHWC& shape,
                             const TensorDescriptor& descriptor,
diff --git a/tensorflow/lite/delegates/gpu/cl/testing/BUILD b/tensorflow/lite/delegates/gpu/cl/testing/BUILD
index 4a0dafe9233..9cc0e4c70c6 100644
--- a/tensorflow/lite/delegates/gpu/cl/testing/BUILD
+++ b/tensorflow/lite/delegates/gpu/cl/testing/BUILD
@@ -11,6 +11,7 @@ cc_binary(
         "//tensorflow/lite/delegates/gpu/cl:cl_command_queue",
         "//tensorflow/lite/delegates/gpu/cl:environment",
         "//tensorflow/lite/delegates/gpu/cl:inference_context",
+        "//tensorflow/lite/delegates/gpu/cl:model_hints",
         "//tensorflow/lite/delegates/gpu/cl:opencl_wrapper",
         "//tensorflow/lite/delegates/gpu/cl:precision",
         "//tensorflow/lite/delegates/gpu/cl:tensor_type",
diff --git a/tensorflow/lite/delegates/gpu/cl/testing/performance_profiling.cc b/tensorflow/lite/delegates/gpu/cl/testing/performance_profiling.cc
index 4f67e3d4a2d..f231cf3143a 100644
--- a/tensorflow/lite/delegates/gpu/cl/testing/performance_profiling.cc
+++ b/tensorflow/lite/delegates/gpu/cl/testing/performance_profiling.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
 #include "tensorflow/lite/delegates/gpu/cl/environment.h"
 #include "tensorflow/lite/delegates/gpu/cl/inference_context.h"
+#include "tensorflow/lite/delegates/gpu/cl/model_hints.h"
 #include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
 #include "tensorflow/lite/delegates/gpu/cl/precision.h"
 #include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
@@ -122,7 +123,9 @@ Status RunModelSample(const std::string& model_name) {
   RETURN_IF_ERROR(CreateEnvironment(&env));
 
   InferenceContext::CreateInferenceInfo create_info;
-  create_info.precision = CalculationsPrecision::F16;
+  create_info.precision = env.IsSupported(CalculationsPrecision::F16)
+                              ?
CalculationsPrecision::F16
+                              : CalculationsPrecision::F32;
   create_info.storage_type = GetFastestStorageType(env.device());
   std::cout << "Precision: " << ToString(create_info.precision) << std::endl;
   std::cout << "Storage type: " << ToString(create_info.storage_type)
diff --git a/tensorflow/lite/delegates/gpu/common/model_builder.cc b/tensorflow/lite/delegates/gpu/common/model_builder.cc
index 73d7e8821e8..703343e7a0b 100644
--- a/tensorflow/lite/delegates/gpu/common/model_builder.cc
+++ b/tensorflow/lite/delegates/gpu/common/model_builder.cc
@@ -1193,6 +1193,7 @@ class ElementwiseOperationParser : public TFLiteOperationParser {
     switch (operation_type_) {
       case OperationType::ABS:
       case OperationType::COS:
+      case OperationType::EXP:
       case OperationType::LOG:
       case OperationType::RSQRT:
       case OperationType::SIGMOID:
@@ -1872,50 +1873,6 @@ class Resize2DOperationParser : public TFLiteOperationParser {
   SamplingType sampling_type_ = SamplingType::UNKNOWN;
 };
 
-class SoftmaxOperationParser : public TFLiteOperationParser {
- public:
-  Status IsSupported(const TfLiteContext* context,
-                     const TfLiteNode* tflite_node,
-                     const TfLiteRegistration* registration) final {
-    RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 1));
-    RETURN_IF_ERROR(
-        CheckInputsOutputs(context, tflite_node, /*inputs=*/1, /*outputs=*/1));
-    TfLiteSoftmaxParams* tf_options = nullptr;
-    RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options));
-    if (tf_options->beta != 1) {
-      // TODO(eignasheva): figure out, what's wrong with softmax.
-      return UnimplementedError("Softmax.beta != 1 is not supported.");
-    }
-    return OkStatus();
-  }
-
-  Status Parse(const TfLiteNode* tflite_node,
-               const TfLiteRegistration* registration, GraphFloat32* graph,
-               ObjectReader* reader) final {
-    Node* node = graph->NewNode();
-    node->operation.type = ToString(OperationType::SOFTMAX);
-    RETURN_IF_ERROR(reader->AddInput(node, 0));
-    RETURN_IF_ERROR(reader->AddOutputs(node));
-
-    const auto* tf_options =
-        reinterpret_cast<const TfLiteSoftmaxParams*>(tflite_node->builtin_data);
-    if (!tf_options) {
-      return InternalError("Missing tflite params");
-    }
-    if (tf_options->beta != 1) {
-      // there is multiply by scalar operation fused in softmax. Make a layer
-      // out of it before softmax.
-      return UnimplementedError("Softmax.beta != 1 is not supported.");
-      // auto mul_node = reader->NewPassthroughNode(node);
-      // mul_node->operation.type = ToString(OperationType::MUL);
-    }
-    SoftmaxAttributes attr;
-    attr.axis = Axis::CHANNELS;  // always by channels
-    node->operation.attributes = attr;
-    return OkStatus();
-  }
-};
-
 class SliceOperationParser : public TFLiteOperationParser {
  public:
   Status IsSupported(const TfLiteContext* context,
@@ -1995,6 +1952,86 @@ class SliceOperationParser : public TFLiteOperationParser {
   }
 };
 
+class SoftmaxOperationParser : public TFLiteOperationParser {
+ public:
+  Status IsSupported(const TfLiteContext* context,
+                     const TfLiteNode* tflite_node,
+                     const TfLiteRegistration* registration) final {
+    RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 1));
+    RETURN_IF_ERROR(
+        CheckInputsOutputs(context, tflite_node, /*inputs=*/1, /*outputs=*/1));
+    TfLiteSoftmaxParams* tf_options = nullptr;
+    RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options));
+    if (tf_options->beta != 1) {
+      // TODO(eignasheva): figure out, what's wrong with softmax.
+      return UnimplementedError("Softmax.beta != 1 is not supported.");
+    }
+    return OkStatus();
+  }
+
+  Status Parse(const TfLiteNode* tflite_node,
+               const TfLiteRegistration* registration, GraphFloat32* graph,
+               ObjectReader* reader) final {
+    Node* node = graph->NewNode();
+    node->operation.type = ToString(OperationType::SOFTMAX);
+    RETURN_IF_ERROR(reader->AddInput(node, 0));
+    RETURN_IF_ERROR(reader->AddOutputs(node));
+
+    const auto* tf_options =
+        reinterpret_cast<const TfLiteSoftmaxParams*>(tflite_node->builtin_data);
+    if (!tf_options) {
+      return InternalError("Missing tflite params");
+    }
+    if (tf_options->beta != 1) {
+      // there is multiply by scalar operation fused in softmax. Make a layer
+      // out of it before softmax.
+      return UnimplementedError("Softmax.beta != 1 is not supported.");
+      // auto mul_node = reader->NewPassthroughNode(node);
+      // mul_node->operation.type = ToString(OperationType::MUL);
+    }
+    SoftmaxAttributes attr;
+    attr.axis = Axis::CHANNELS;  // always by channels
+    node->operation.attributes = attr;
+    return OkStatus();
+  }
+};
+
+class SpaceToDepthOperationParser : public TFLiteOperationParser {
+ public:
+  Status IsSupported(const TfLiteContext* context,
+                     const TfLiteNode* tflite_node,
+                     const TfLiteRegistration* registration) final {
+    RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 1));
+    RETURN_IF_ERROR(
+        CheckInputsOutputs(context, tflite_node, /*inputs=*/1, /*outputs=*/1));
+    // TODO(impjdi): Dims check.
+    TfLiteSpaceToDepthParams* s2d_params = nullptr;
+    RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &s2d_params));
+    if (s2d_params->block_size == 1) {
+      return InvalidArgumentError("SPACE_TO_DEPTH block_size = 1 is a no-op.");
+    }
+    if (s2d_params->block_size < 1) {
+      return InvalidArgumentError("SPACE_TO_DEPTH block_size must be > 1.");
+    }
+    return OkStatus();
+  }
+
+  Status Parse(const TfLiteNode* tflite_node,
+               const TfLiteRegistration* registration, GraphFloat32* graph,
+               ObjectReader* reader) final {
+    Node* node = graph->NewNode();
+    node->operation.type = ToString(OperationType::SPACE_TO_DEPTH);
+    RETURN_IF_ERROR(reader->AddInput(node, 0));
+    RETURN_IF_ERROR(reader->AddOutputs(node));
+    const auto* tf_options = reinterpret_cast<const TfLiteSpaceToDepthParams*>(
+        tflite_node->builtin_data);
+    SpaceToDepthAttributes attr;
+    attr.block_size = tf_options->block_size;
+    node->operation.attributes = attr;
+    return OkStatus();
+  }
+};
+
 class StridedSliceOperationParser : public TFLiteOperationParser {
  public:
   Status IsSupported(const TfLiteContext* context,
@@ -2651,12 +2688,12 @@ std::unique_ptr<TFLiteOperationParser> NewOperationParser(
                                                           OperationType::RSQRT);
     case kTfLiteBuiltinSin:
       return absl::make_unique<ElementwiseOperationParser>(OperationType::SIN);
-    case kTfLiteBuiltinSoftmax:
-      return absl::make_unique<SoftmaxOperationParser>();
     case kTfLiteBuiltinSlice:
       return absl::make_unique<SliceOperationParser>();
-    case kTfLiteBuiltinStridedSlice:
-      return absl::make_unique<StridedSliceOperationParser>();
+    case kTfLiteBuiltinSoftmax:
+      return absl::make_unique<SoftmaxOperationParser>();
+    case kTfLiteBuiltinSpaceToDepth:
+      return absl::make_unique<SpaceToDepthOperationParser>();
     case kTfLiteBuiltinSqrt:
       return absl::make_unique<ElementwiseOperationParser>(OperationType::SQRT);
     case kTfLiteBuiltinSquare:
@@ -2665,6 +2702,8 @@ std::unique_ptr<TFLiteOperationParser> NewOperationParser(
     case kTfLiteBuiltinSquaredDifference:
       return absl::make_unique<ElementwiseOperationParser>(
          OperationType::SQUARED_DIFF);
+    case kTfLiteBuiltinStridedSlice:
+      return absl::make_unique<StridedSliceOperationParser>();
     case kTfLiteBuiltinSub:
       return absl::make_unique<ElementwiseOperationParser>(OperationType::SUB);
     case kTfLiteBuiltinTanh:
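The parser above only validates block_size; the shape contract itself is (B, H, W, C) -> (B, H/b, W/b, C*b*b) for block size b. A tiny sketch of that arithmetic, using the 1x4x4x1 shape that the space_to_depth tests elsewhere in this change exercise (the shape values are taken from those tests):

#include <iostream>

int main() {
  const int B = 1, H = 4, W = 4, C = 1, b = 2;  // block_size must be > 1
  // Each bxb spatial block collapses into one pixel with b*b times the channels.
  std::cout << B << "x" << H / b << "x" << W / b << "x" << C * b * b
            << "\n";  // prints 1x2x2x4
}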
diff --git a/tensorflow/lite/delegates/gpu/common/operations.cc b/tensorflow/lite/delegates/gpu/common/operations.cc
index 0d5c3429a49..fa5cdc54047 100644
--- a/tensorflow/lite/delegates/gpu/common/operations.cc
+++ b/tensorflow/lite/delegates/gpu/common/operations.cc
@@ -90,6 +90,8 @@ std::string ToString(enum OperationType op) {
       return "depthwise_convolution";
     case OperationType::DIV:
       return "div";
+    case OperationType::EXP:
+      return "exp";
     case OperationType::FULLY_CONNECTED:
       return "fully_connected";
     case OperationType::HARD_SWISH:
@@ -134,6 +136,8 @@ std::string ToString(enum OperationType op) {
       return "softmax";
     case OperationType::SPACE_TO_BATCH:
       return "space_to_batch";
+    case OperationType::SPACE_TO_DEPTH:
+      return "space_to_depth";
     case OperationType::SQRT:
       return "sqrt";
     case OperationType::SQUARE:
@@ -165,6 +169,7 @@ OperationType OperationTypeFromString(const std::string& name) {
       {"cos", OperationType::COS},
       {"depthwise_convolution", OperationType::DEPTHWISE_CONVOLUTION},
       {"div", OperationType::DIV},
+      {"exp", OperationType::EXP},
       {"fully_connected", OperationType::FULLY_CONNECTED},
       {"hard_swish", OperationType::HARD_SWISH},
       {"log", OperationType::LOG},
@@ -186,6 +191,7 @@ OperationType OperationTypeFromString(const std::string& name) {
       {"sin", OperationType::SIN},
       {"slice", OperationType::SLICE},
       {"softmax", OperationType::SOFTMAX},
+      {"space_to_depth", OperationType::SPACE_TO_DEPTH},
       {"sqrt", OperationType::SQRT},
       {"square", OperationType::SQUARE},
       {"squared_diff", OperationType::SQUARED_DIFF},
diff --git a/tensorflow/lite/delegates/gpu/common/operations.h b/tensorflow/lite/delegates/gpu/common/operations.h
index 87bb3ec383f..c5be9897fed 100644
--- a/tensorflow/lite/delegates/gpu/common/operations.h
+++ b/tensorflow/lite/delegates/gpu/common/operations.h
@@ -43,6 +43,7 @@ enum class OperationType {
   COS,
   DEPTHWISE_CONVOLUTION,
   DIV,
+  EXP,
   FULLY_CONNECTED,
   HARD_SWISH,
   LOG,
@@ -65,6 +66,7 @@ enum class OperationType {
   SLICE,
   SOFTMAX,
   SPACE_TO_BATCH,
+  SPACE_TO_DEPTH,
   SQRT,
   SQUARE,
   SQUARED_DIFF,
@@ -472,6 +474,10 @@ struct TransposeAttributes {
 // the given input.
 BHWC CalculateOutputShape(const BHWC& input, const TransposeAttributes& attr);
 
+struct SpaceToDepthAttributes {
+  int block_size;
+};
+
 }  // namespace gpu
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/BUILD b/tensorflow/lite/delegates/gpu/gl/kernels/BUILD
index 755141fbb37..68ae9dfd4dc 100644
--- a/tensorflow/lite/delegates/gpu/gl/kernels/BUILD
+++ b/tensorflow/lite/delegates/gpu/gl/kernels/BUILD
@@ -569,6 +569,35 @@ cc_test(
     ],
 )
 
+cc_library(
+    name = "space_to_depth",
+    srcs = ["space_to_depth.cc"],
+    hdrs = ["space_to_depth.h"],
+    deps = [
+        "//tensorflow/lite/delegates/gpu/common:operations",
+        "//tensorflow/lite/delegates/gpu/common:status",
+        "//tensorflow/lite/delegates/gpu/gl:node_shader",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/types:any",
+    ],
+)
+
+cc_test(
+    name = "space_to_depth_test",
+    srcs = ["space_to_depth_test.cc"],
+    tags = tf_gpu_tests_tags() + [
+        "notap",
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":space_to_depth",
+        ":test_util",
+        "//tensorflow/lite/delegates/gpu/common:operations",
+        "//tensorflow/lite/delegates/gpu/common:shape",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 cc_library(
     name = "test_util",
     testonly = 1,
@@ -676,6 +705,7 @@ TFLITE_GPU_BINARY_RELEASE_OPERATORS = [
     "resize",
     "slice",
     "softmax",
+    "space_to_depth",
     "transpose_conv",
 ]
 
diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/elementwise.cc b/tensorflow/lite/delegates/gpu/gl/kernels/elementwise.cc
index 7ba2dd871e7..941a32a8769 100644
--- a/tensorflow/lite/delegates/gpu/gl/kernels/elementwise.cc
+++ b/tensorflow/lite/delegates/gpu/gl/kernels/elementwise.cc
@@ -40,6 +40,9 @@ class ElementwiseOneArgument : public NodeShader {
       case OperationType::COS:
         source = "value_0 = cos(value_0);";
         break;
+      case OperationType::EXP:
+        source = "value_0 = exp(value_0);";
+        break;
       case OperationType::HARD_SWISH:
         source =
             "value_0 *= clamp(value_0 / 6.0 + vec4(0.5), vec4(0.0), "
@@ -258,11 +261,13 @@ class ElementwiseTwoArguments : public NodeShader {
     if (IsSupportedBroadcast(ctx)) {
       return ImplementElementwiseBroadcast(ctx, generated_code);
     }
-    auto attr =
-        absl::any_cast<ElementwiseAttributes>(ctx.node->operation.attributes);
-    auto scalar = absl::get_if<float>(&attr.param);
-    if (scalar) {
-      return ImplementElementwiseWithScalar(ctx, *scalar, generated_code);
+    const ElementwiseAttributes* attr =
+        absl::any_cast<ElementwiseAttributes>(&ctx.node->operation.attributes);
+    if (attr) {
+      auto scalar = absl::get_if<float>(&attr->param);
+      if (scalar) {
+        return ImplementElementwiseWithScalar(ctx, *scalar, generated_code);
+      }
     }
     return InvalidArgumentError(
         "This case is not supported by elementwise with two arguments "
@@ -280,6 +285,7 @@ std::unique_ptr<NodeShader> NewElementwiseNodeShader(
   switch (operation_type) {
     case OperationType::ABS:
     case OperationType::COS:
+    case OperationType::EXP:
    case OperationType::LOG:
     case OperationType::HARD_SWISH:
     case OperationType::RSQRT:
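The elementwise change above swaps the value form of absl::any_cast for the pointer form. A small sketch of why (the ElementwiseAttributes struct here is a stand-in): casting a pointer yields nullptr on a type mismatch or an empty any, instead of throwing absl::bad_any_cast, so the shader can fall through to its error path cleanly.

#include <iostream>
#include "absl/types/any.h"

struct ElementwiseAttributes { float scalar = 0.f; };  // stand-in

int main() {
  absl::any attributes;  // empty: the node carried no attributes
  // Pointer form: returns ElementwiseAttributes* or nullptr, never throws.
  const ElementwiseAttributes* attr =
      absl::any_cast<ElementwiseAttributes>(&attributes);
  if (attr == nullptr) {
    std::cout << "no attributes; fall through to the error path\n";
  }
}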
diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/elementwise_test.cc b/tensorflow/lite/delegates/gpu/gl/kernels/elementwise_test.cc
index e597cc898e9..3316395f5e3 100644
--- a/tensorflow/lite/delegates/gpu/gl/kernels/elementwise_test.cc
+++ b/tensorflow/lite/delegates/gpu/gl/kernels/elementwise_test.cc
@@ -74,6 +74,22 @@ TEST(ElementwiseTest, Div) {
               Pointwise(FloatNear(1e-6), {0.0, -3.1, -4.0, 1.0}));
 }
 
+TEST(ElementwiseTest, Exp) {
+  OperationType op_type = OperationType::EXP;
+  const BHWC shape(1, 1, 1, 7);
+  SingleOpModel model({/*type=*/ToString(op_type), /*attributes=*/{}},
+                      /*inputs=*/{GetTensorRef(0, shape)},
+                      /*outputs=*/{GetTensorRef(1, shape)});
+  ASSERT_TRUE(model.PopulateTensor(
+      0, {0.0f, 1.0f, -1.0f, 100.0f, -100.0f, 0.01f, -0.01f}));
+  ASSERT_OK(model.Invoke(*NewElementwiseNodeShader(op_type)));
+  EXPECT_THAT(model.GetOutput(0),
+              Pointwise(FloatNear(1e-6),
+                        {std::exp(0.0f), std::exp(1.0f), std::exp(-1.0f),
+                         std::exp(100.0f), std::exp(-100.0f), std::exp(0.01f),
+                         std::exp(-0.01f)}));
+}
+
 TEST(ElementwiseTest, HardSwish) {
   OperationType op_type = OperationType::HARD_SWISH;
   const BHWC shape(1, 1, 1, 7);
diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/registry.cc b/tensorflow/lite/delegates/gpu/gl/kernels/registry.cc
index 924f7dbf1ec..cb4bed369dc 100644
--- a/tensorflow/lite/delegates/gpu/gl/kernels/registry.cc
+++ b/tensorflow/lite/delegates/gpu/gl/kernels/registry.cc
@@ -94,6 +94,7 @@ class Registry : public NodeShader {
     insert_elementwise_op(Type::ABS);
     insert_elementwise_op(Type::COS);
     insert_elementwise_op(Type::DIV);
+    insert_elementwise_op(Type::EXP);
     insert_elementwise_op(Type::HARD_SWISH);
     insert_elementwise_op(Type::LOG);
     insert_elementwise_op(Type::MAXIMUM);
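Before the GLSL implementation that follows, a scalar C++ reference of the same gather may help when checking the index math by hand (HWC layout, no 4-channel vectorization; this is a hypothetical free function written for illustration, not part of the change):

#include <vector>

std::vector<float> SpaceToDepthReference(const std::vector<float>& src,
                                         int src_h, int src_w, int src_c,
                                         int b) {
  const int dst_h = src_h / b, dst_w = src_w / b, dst_c = src_c * b * b;
  std::vector<float> dst(dst_h * dst_w * dst_c);
  for (int y = 0; y < dst_h; ++y) {
    for (int x = 0; x < dst_w; ++x) {
      for (int c = 0; c < dst_c; ++c) {
        // Same decomposition as the shader: which pixel of the bxb block this
        // output channel came from, and which original channel.
        const int block_id = c / src_c;
        const int src_x = x * b + block_id % b;  // column inside the block
        const int src_y = y * b + block_id / b;  // row inside the block
        const int sc = c % src_c;                // original channel
        dst[(y * dst_w + x) * dst_c + c] =
            src[(src_y * src_w + src_x) * src_c + sc];
      }
    }
  }
  return dst;
}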
diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/space_to_depth.cc b/tensorflow/lite/delegates/gpu/gl/kernels/space_to_depth.cc
new file mode 100644
index 00000000000..1d49da0e3fa
--- /dev/null
+++ b/tensorflow/lite/delegates/gpu/gl/kernels/space_to_depth.cc
@@ -0,0 +1,74 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/delegates/gpu/gl/kernels/space_to_depth.h"
+
+#include
+#include
+
+#include "absl/memory/memory.h"
+#include "absl/types/any.h"
+#include "tensorflow/lite/delegates/gpu/common/operations.h"
+#include "tensorflow/lite/delegates/gpu/common/status.h"
+#include "tensorflow/lite/delegates/gpu/gl/node_shader.h"
+
+namespace tflite {
+namespace gpu {
+namespace gl {
+namespace {
+
+class SpaceToDepth : public NodeShader {
+ public:
+  Status GenerateCode(const GenerationContext& ctx,
+                      GeneratedCode* generated_code) const final {
+    const auto attr =
+        absl::any_cast<SpaceToDepthAttributes>(ctx.node->operation.attributes);
+    const auto& input_data_0 = ctx.graph->FindInputs(ctx.node->id)[0]->tensor;
+    std::string code = R"(
+      for (int i = 0; i < 4; ++i) {
+        int dst_c = 4 * gid.z + i;
+        int block_id = dst_c / $input_data_0_c$;
+        int src_x = gid.x * $block_size$ + block_id % $block_size$;
+        int src_y = gid.y * $block_size$ + block_id / $block_size$;
+        int src_c = dst_c % $input_data_0_c$;
+        value_0[i] = $input_data_0[src_x, src_y, src_c / 4]$[src_c % 4];
+      }
+    )";
+
+    *generated_code = {
+        /*parameters=*/{
+            {"block_size", attr.block_size},
+            {"input_data_0_c", input_data_0.shape.c},
+        },
+        /*objects=*/{},
+        /*shared_variables=*/{},
+        /*workload=*/uint3(),
+        /*workgroup=*/uint3(),
+        /*source_code=*/std::move(code),
+        /*input=*/IOStructure::ONLY_DEFINITIONS,
+        /*output=*/IOStructure::AUTO,
+    };
+    return OkStatus();
+  }
+};
+}  // namespace
+
+std::unique_ptr<NodeShader> NewSpaceToDepthNodeShader() {
+  return absl::make_unique<SpaceToDepth>();
+}
+
+}  // namespace gl
+}  // namespace gpu
+}  // namespace tflite
diff --git a/third_party/toolchains/preconfig/centos6/cuda10.0-cudnn7/cuda/cuda/cuda_config.h b/tensorflow/lite/delegates/gpu/gl/kernels/space_to_depth.h
old mode 100755
new mode 100644
similarity index 54%
rename from third_party/toolchains/preconfig/centos6/cuda10.0-cudnn7/cuda/cuda/cuda_config.h
rename to tensorflow/lite/delegates/gpu/gl/kernels/space_to_depth.h
index 72a7cf77346..3c52ef4eba7
--- a/third_party/toolchains/preconfig/centos6/cuda10.0-cudnn7/cuda/cuda/cuda_config.h
+++ b/tensorflow/lite/delegates/gpu/gl/kernels/space_to_depth.h
@@ -1,4 +1,4 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,15 +13,21 @@ See the License for the specific language governing permissions and
 limitations under the License.
==============================================================================*/ -#ifndef CUDA_CUDA_CONFIG_H_ -#define CUDA_CUDA_CONFIG_H_ +#ifndef TENSORFLOW_LITE_DELEGATES_GPU_GL_KERNELS_SPACE_TO_DEPTH_H_ +#define TENSORFLOW_LITE_DELEGATES_GPU_GL_KERNELS_SPACE_TO_DEPTH_H_ -#define TF_CUDA_CAPABILITIES CudaVersion("3.0"), CudaVersion("6.0") +#include <memory> -#define TF_CUDA_VERSION "10.0" -#define TF_CUDA_LIB_VERSION "10.0" -#define TF_CUDNN_VERSION "7" +#include "tensorflow/lite/delegates/gpu/gl/node_shader.h" -#define TF_CUDA_TOOLKIT_PATH "/usr/local/cuda-10.0" +namespace tflite { +namespace gpu { +namespace gl { -#endif // CUDA_CUDA_CONFIG_H_ +std::unique_ptr<NodeShader> NewSpaceToDepthNodeShader(); + +} // namespace gl +} // namespace gpu +} // namespace tflite + +#endif // TENSORFLOW_LITE_DELEGATES_GPU_GL_KERNELS_SPACE_TO_DEPTH_H_ diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/space_to_depth_test.cc b/tensorflow/lite/delegates/gpu/gl/kernels/space_to_depth_test.cc new file mode 100644 index 00000000000..0ff132b8147 --- /dev/null +++ b/tensorflow/lite/delegates/gpu/gl/kernels/space_to_depth_test.cc @@ -0,0 +1,104 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/delegates/gpu/gl/kernels/space_to_depth.h" + +#include <vector> + +#include <gmock/gmock.h> +#include <gtest/gtest.h> +#include "tensorflow/lite/delegates/gpu/common/operations.h" +#include "tensorflow/lite/delegates/gpu/gl/kernels/test_util.h" + +using ::testing::FloatNear; +using ::testing::Pointwise; + +namespace tflite { +namespace gpu { +namespace gl { +namespace { + +TEST(SpaceToDepthTest, TensorShape1x2x2x1BlockSize2) { + const TensorRef<BHWC> input = { + .type = DataType::FLOAT32, .shape = BHWC(1, 2, 2, 1), .ref = 0}; + const TensorRef<BHWC> output = { + .type = DataType::FLOAT32, .shape = BHWC(1, 1, 1, 4), .ref = 1}; + const SpaceToDepthAttributes attr = {.block_size = 2}; + SingleOpModel model({ToString(OperationType::SPACE_TO_DEPTH), attr}, {input}, + {output}); + ASSERT_TRUE(model.PopulateTensor(0, {1.0f, 2.0f, 3.0f, 4.0f})); + ASSERT_OK(model.Invoke(*NewSpaceToDepthNodeShader())); + EXPECT_THAT(model.GetOutput(0), + Pointwise(FloatNear(1e-6), {1.0f, 2.0f, 3.0f, 4.0f})); +} + +TEST(SpaceToDepthTest, TensorShape1x2x2x2BlockSize2) { + const TensorRef<BHWC> input = { + .type = DataType::FLOAT32, .shape = BHWC(1, 2, 2, 2), .ref = 0}; + const TensorRef<BHWC> output = { + .type = DataType::FLOAT32, .shape = BHWC(1, 1, 1, 8), .ref = 1}; + const SpaceToDepthAttributes attr = {.block_size = 2}; + SingleOpModel model({ToString(OperationType::SPACE_TO_DEPTH), attr}, {input}, + {output}); + ASSERT_TRUE(model.PopulateTensor( + 0, {1.4f, 2.3f, 3.2f, 4.1f, 5.4f, 6.3f, 7.2f, 8.1f})); + ASSERT_OK(model.Invoke(*NewSpaceToDepthNodeShader())); + EXPECT_THAT(model.GetOutput(0), + Pointwise(FloatNear(1e-6), + {1.4f, 2.3f, 3.2f, 4.1f, 5.4f, 6.3f, 7.2f, 8.1f})); +} + +TEST(SpaceToDepthTest, TensorShape1x2x2x3BlockSize2) { + const TensorRef<BHWC> input = { + .type =
DataType::FLOAT32, .shape = BHWC(1, 2, 2, 3), .ref = 0}; + const TensorRef<BHWC> output = { + .type = DataType::FLOAT32, .shape = BHWC(1, 1, 1, 12), .ref = 1}; + const SpaceToDepthAttributes attr = {.block_size = 2}; + SingleOpModel model({ToString(OperationType::SPACE_TO_DEPTH), attr}, {input}, + {output}); + ASSERT_TRUE(model.PopulateTensor(0, {1.0f, 2.0f, 3.0f, // + 4.0f, 5.0f, 6.0f, // + 7.0f, 8.0f, 9.0f, // + 10.0f, 11.0f, 12.0f})); + ASSERT_OK(model.Invoke(*NewSpaceToDepthNodeShader())); + EXPECT_THAT( + model.GetOutput(0), + Pointwise(FloatNear(1e-6), {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, // + 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f})); +} + +TEST(SpaceToDepthTest, TensorShape1x4x4x1BlockSize2) { + const TensorRef<BHWC> input = { + .type = DataType::FLOAT32, .shape = BHWC(1, 4, 4, 1), .ref = 0}; + const TensorRef<BHWC> output = { + .type = DataType::FLOAT32, .shape = BHWC(1, 2, 2, 4), .ref = 1}; + const SpaceToDepthAttributes attr = {.block_size = 2}; + SingleOpModel model({ToString(OperationType::SPACE_TO_DEPTH), attr}, {input}, + {output}); + ASSERT_TRUE(model.PopulateTensor(0, {1.0, 2.0, 5.0, 6.0, // + 3.0, 4.0, 7.0, 8.0, // + 9.0, 10.0, 13.0, 14.0, // + 11.0, 12.0, 15.0, 16.0})); + ASSERT_OK(model.Invoke(*NewSpaceToDepthNodeShader())); + EXPECT_THAT(model.GetOutput(0), + Pointwise(FloatNear(1e-6), {1.0, 2.0, 3.0, 4.0, // + 5.0, 6.0, 7.0, 8.0, // + 9.0, 10.0, 11.0, 12.0, // + 13.0, 14.0, 15.0, 16.0})); +} +} // namespace +} // namespace gl +} // namespace gpu +} // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/metal/BUILD b/tensorflow/lite/delegates/gpu/metal/BUILD index 3b05a7d4386..b3fbf179293 100644 --- a/tensorflow/lite/delegates/gpu/metal/BUILD +++ b/tensorflow/lite/delegates/gpu/metal/BUILD @@ -1,5 +1,10 @@ load("@build_bazel_rules_apple//apple:ios.bzl", "ios_application", "ios_unit_test") -load("//tensorflow/lite:special_rules.bzl", "tflite_ios_per_kernel_test", "tflite_portable_test_suite") +load( + "//tensorflow/lite:special_rules.bzl", + "tflite_ios_lab_runner", + "tflite_ios_per_kernel_test", + "tflite_portable_test_suite", +) load( "//tensorflow/core/platform:build_config_root.bzl", "tf_gpu_tests_tags", @@ -75,6 +80,7 @@ ios_unit_test( name = "common_test", testonly = 1, minimum_os_version = "10.0", + runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_android", @@ -111,6 +117,7 @@ ios_unit_test( name = "compiled_model_test", testonly = 1, minimum_os_version = "10.0", + runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_android", @@ -179,6 +186,7 @@ ios_unit_test( name = "environment_test", testonly = 1, minimum_os_version = "10.0", + runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_android", @@ -219,6 +227,7 @@ ios_unit_test( name = "inference_context_test", testonly = 1, minimum_os_version = "10.0", + runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_android", @@ -281,6 +290,7 @@ objc_library( ios_unit_test( name = "ComponentsTests", minimum_os_version = "10.0", + runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + ["notap"], test_host = ":TestApplication", deps = [ diff --git a/tensorflow/lite/delegates/gpu/metal/api.cc b/tensorflow/lite/delegates/gpu/metal/api.cc index 802697ee9a9..ee98c403952 100644 --- a/tensorflow/lite/delegates/gpu/metal/api.cc +++ b/tensorflow/lite/delegates/gpu/metal/api.cc @@ -43,6 +43,7 @@ limitations
under the License. #include "tensorflow/lite/delegates/gpu/metal/kernels/resize.h" #include "tensorflow/lite/delegates/gpu/metal/kernels/slice.h" #include "tensorflow/lite/delegates/gpu/metal/kernels/softmax.h" +#include "tensorflow/lite/delegates/gpu/metal/kernels/space_to_depth.h" #include "tensorflow/lite/delegates/gpu/metal/kernels/transpose_conv.h" #include "tensorflow/lite/delegates/gpu/metal/runtime_options.h" @@ -137,6 +138,12 @@ std::vector<ComputeTaskDescriptorPtr> SelectSoftmax(const GraphFloat32& graph, } } +std::vector<ComputeTaskDescriptorPtr> SelectSpaceToDepth( + const GraphFloat32& graph, int id, ValueId input_id, ValueId output_id, + const SpaceToDepthAttributes& attr) { + return SpaceToDepth(id, input_id, output_id, attr); +} + Status RegisterPrimaryOps(const GraphFloat32& graph, const Node* node, const std::vector<ValueId>& inputs, const std::vector<ValueId>& outputs, @@ -254,8 +261,14 @@ Status RegisterPrimaryOps(const GraphFloat32& graph, const Node* node, *tasks = SelectSoftmax(graph, node_id, inputs[0], outputs[0]); break; } + case OperationType::SPACE_TO_DEPTH: + *tasks = SelectSpaceToDepth( + graph, node_id, inputs[0], outputs[0], + absl::any_cast<const SpaceToDepthAttributes&>(node->operation.attributes)); + break; case OperationType::ABS: case OperationType::COS: + case OperationType::EXP: case OperationType::HARD_SWISH: case OperationType::LOG: case OperationType::RSQRT: diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/BUILD b/tensorflow/lite/delegates/gpu/metal/kernels/BUILD index ada4b7c04ed..586484beb00 100644 --- a/tensorflow/lite/delegates/gpu/metal/kernels/BUILD +++ b/tensorflow/lite/delegates/gpu/metal/kernels/BUILD @@ -1,5 +1,10 @@ load("@build_bazel_rules_apple//apple:ios.bzl", "ios_unit_test") -load("//tensorflow/lite:special_rules.bzl", "tflite_ios_per_kernel_test", "tflite_portable_test_suite") +load( + "//tensorflow/lite:special_rules.bzl", + "tflite_ios_lab_runner", + "tflite_ios_per_kernel_test", + "tflite_portable_test_suite", +) load( "//tensorflow/core/platform:build_config_root.bzl", "tf_gpu_tests_tags", @@ -30,6 +35,7 @@ cc_library( deps = [ ":resize", ":slice", ":softmax", + ":space_to_depth", ":transpose_conv", ], ) @@ -64,6 +70,7 @@ ios_unit_test( name = "add_test", testonly = 1, minimum_os_version = "10.0", + runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_android", @@ -101,6 +108,7 @@ ios_unit_test( name = "concat_test", testonly = 1, minimum_os_version = "10.0", + runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_android", @@ -139,6 +147,7 @@ ios_unit_test( name = "conv_test", testonly = 1, minimum_os_version = "10.0", + runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_android", @@ -191,6 +200,7 @@ ios_unit_test( name = "depthwise_conv_test", testonly = 1, minimum_os_version = "10.0", + runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_android", @@ -230,6 +240,7 @@ ios_unit_test( name = "elementwise_test", testonly = 1, minimum_os_version = "10.0", + runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_android", @@ -269,6 +280,7 @@ ios_unit_test( name = "fully_connected_test", testonly = 1, minimum_os_version = "10.0", + runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_android", @@ -307,6 +319,7 @@ ios_unit_test( name = "max_unpooling_test", testonly = 1, minimum_os_version = "10.0", + runner =
tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_android", @@ -345,6 +358,7 @@ ios_unit_test( name = "mean_test", testonly = 1, minimum_os_version = "10.0", + runner = tflite_ios_lab_runner("IOS_LATEST"), tags = [ "notap", "tflite_not_portable_android", @@ -384,6 +398,7 @@ ios_unit_test( name = "mul_test", testonly = 1, minimum_os_version = "9.0", + runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_android", @@ -422,6 +437,7 @@ ios_unit_test( name = "padding_test", testonly = 1, minimum_os_version = "10.0", + runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_android", @@ -461,6 +477,7 @@ ios_unit_test( name = "pooling_test", testonly = 1, minimum_os_version = "10.0", + runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_android", @@ -500,6 +517,7 @@ ios_unit_test( name = "prelu_test", testonly = 1, minimum_os_version = "10.0", + runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_android", @@ -538,6 +556,7 @@ ios_unit_test( name = "relu_test", testonly = 1, minimum_os_version = "10.0", + runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_android", @@ -576,6 +595,7 @@ ios_unit_test( name = "resize_test", testonly = 1, minimum_os_version = "10.0", + runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_android", @@ -615,6 +635,7 @@ ios_unit_test( name = "reshape_test", testonly = 1, minimum_os_version = "10.0", + runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_android", @@ -653,6 +674,7 @@ ios_unit_test( name = "slice_test", testonly = 1, minimum_os_version = "10.0", + runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_android", @@ -691,6 +713,7 @@ ios_unit_test( name = "softmax_test", testonly = 1, minimum_os_version = "10.0", + runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_android", @@ -698,6 +721,42 @@ ios_unit_test( deps = [":softmax_test_lib"], ) +cc_library( + name = "space_to_depth", + srcs = ["space_to_depth.cc"], + hdrs = ["space_to_depth.h"], + deps = [ + "//tensorflow/lite/delegates/gpu/common:model", + "//tensorflow/lite/delegates/gpu/common:operations", + "//tensorflow/lite/delegates/gpu/common:util", + "//tensorflow/lite/delegates/gpu/metal:compute_task_descriptor", + "//tensorflow/lite/delegates/gpu/metal:runtime_options", + "//tensorflow/lite/delegates/gpu/metal/kernels:util", + ], +) + +objc_library( + name = "space_to_depth_test_lib", + testonly = 1, + srcs = ["space_to_depth_test.mm"], + sdk_frameworks = ["XCTest"], + deps = [ + ":space_to_depth", + ":test_util", + ], +) + +ios_unit_test( + name = "space_to_depth_test", + testonly = 1, + minimum_os_version = "10.0", + tags = tf_gpu_tests_tags() + [ + "notap", + "tflite_not_portable_android", + ], + deps = [":space_to_depth_test_lib"], +) + cc_library( name = "transpose_conv", srcs = ["transpose_conv.cc"], @@ -730,6 +789,7 @@ ios_unit_test( name = "transpose_conv_test", testonly = 1, minimum_os_version = "10.0", + runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_android", diff --git 
a/tensorflow/lite/delegates/gpu/metal/kernels/elementwise.cc b/tensorflow/lite/delegates/gpu/metal/kernels/elementwise.cc index 18430f8e71f..7fdfd3257ea 100644 --- a/tensorflow/lite/delegates/gpu/metal/kernels/elementwise.cc +++ b/tensorflow/lite/delegates/gpu/metal/kernels/elementwise.cc @@ -153,6 +153,7 @@ std::vector<ComputeTaskDescriptorPtr> ElementwiseWithOneInput( {OperationType::HARD_SWISH, "value * clamp(value / 6.0f + FLT4(0.5f), FLT4(0.0f), FLT4(1.0f))"}, {OperationType::COS, "cos(value)"}, + {OperationType::EXP, "exp(value)"}, {OperationType::LOG, "log(value)"}, {OperationType::SQRT, "sqrt(value)"}, {OperationType::RSQRT, "1.0 / sqrt(value)"}, diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/elementwise_test.mm b/tensorflow/lite/delegates/gpu/metal/kernels/elementwise_test.mm index c70fd7368de..4baa4573909 100644 --- a/tensorflow/lite/delegates/gpu/metal/kernels/elementwise_test.mm +++ b/tensorflow/lite/delegates/gpu/metal/kernels/elementwise_test.mm @@ -91,6 +91,21 @@ TensorRef<BHWC> GetTensorRef(int ref, const BHWC& shape) { XCTAssertTrue(status.ok(), @"%s", status.error_message().c_str()); } +- (void)testExp { + OperationType op_type = OperationType::EXP; + const BHWC shape(1, 1, 1, 7); + SingleOpModel model({/*type=*/ToString(op_type), /*attributes=*/{}}, + /*inputs=*/{GetTensorRef(0, shape)}, + /*outputs=*/{GetTensorRef(1, shape)}); + XCTAssertTrue(model.PopulateTensor(0, {0.0f, 1.0f, -1.0f, 100.0f, -100.0f, 0.01f, -0.01f})); + auto status = model.Invoke(); + XCTAssertTrue(status.ok(), @"%s", status.error_message().c_str()); + status = CompareVectors({std::exp(0.0f), std::exp(1.0f), std::exp(-1.0f), std::exp(100.0f), + std::exp(-100.0f), std::exp(0.01f), std::exp(-0.01f)}, + model.GetOutput(0), 1e-6f); + XCTAssertTrue(status.ok(), @"%s", status.error_message().c_str()); +} + - (void)testHardSwish { OperationType op_type = OperationType::HARD_SWISH; const BHWC shape(1, 1, 1, 7); diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/space_to_depth.cc b/tensorflow/lite/delegates/gpu/metal/kernels/space_to_depth.cc new file mode 100644 index 00000000000..3614174ef11 --- /dev/null +++ b/tensorflow/lite/delegates/gpu/metal/kernels/space_to_depth.cc @@ -0,0 +1,129 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+==============================================================================*/ + +#include "tensorflow/lite/delegates/gpu/metal/kernels/space_to_depth.h" + +#include <map> +#include <memory> +#include <utility> +#include <vector> + +#include "tensorflow/lite/delegates/gpu/common/model.h" +#include "tensorflow/lite/delegates/gpu/common/operations.h" +#include "tensorflow/lite/delegates/gpu/common/util.h" +#include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h" +#include "tensorflow/lite/delegates/gpu/metal/kernels/util.h" + +namespace tflite { +namespace gpu { +namespace metal { + +std::vector<ComputeTaskDescriptorPtr> SpaceToDepth( + int id, ValueId input_id, ValueId output_id, + const SpaceToDepthAttributes& attr) { + auto desc = std::make_shared<ComputeTaskDescriptor>(); + desc->id = id; + desc->is_linkable = false; + desc->shader_source = R"( +#include <metal_stdlib> +using namespace metal; +struct uniforms { + uint4 src_size; + uint4 dst_size; + uint4 block_size; +}; +$0 +kernel void ComputeFunction($1 uint3 gid[[thread_position_in_grid]]) { + uint3 src_size = (uint3)(params.src_size.xyz); + uint3 dst_size = (uint3)(params.dst_size.xyz); + uint block_size = (uint)(params.block_size.x); + if (gid.x >= dst_size.x || gid.y >= dst_size.y || gid.z * 4 >= dst_size.z) { + return; + } + FLT4 value; + for (uint i = 0; i < 4; ++i) { + uint dst_c = 4 * gid.z + i; + uint block_id = dst_c / src_size.z; + uint src_x = gid.x * block_size + block_id % block_size; + uint src_y = gid.y * block_size + block_id / block_size; + uint src_c = dst_c % src_size.z; + value[i] = + src_buffer[src_x + src_size.x * (src_y + src_size.y * (src_c / 4))] + [src_c % 4]; + } + $2 + dst_buffer[gid.x + dst_size.x * (gid.y + dst_size.y * gid.z)] = value; +})"; + + desc->input_buffers = {{input_id, "device FLT4* const src_buffer"}}; + + desc->output_buffer = { + output_id, "device FLT4* dst_buffer", + [input_id, attr](const std::map<ValueId, BHWC>& buffers) -> BHWC { + const BHWC& input_shape = buffers.find(input_id)->second; + return BHWC(input_shape.b, // + input_shape.h / attr.block_size, + input_shape.w / attr.block_size, + input_shape.c * attr.block_size * attr.block_size); + }}; + + desc->uniform_buffers = { + {"constant uniforms& params", + [input_id, output_id, attr](const std::map<ValueId, BHWC>& buffers) { + const BHWC& input_shape = buffers.find(input_id)->second; + const BHWC& output_shape = buffers.find(output_id)->second; + const std::vector<int> uniform_params = { + // src_size + input_shape.w, + input_shape.h, + input_shape.c, + 0, + // dst_size + output_shape.w, + output_shape.h, + output_shape.c, + 0, + // block_size + attr.block_size, + 0, + 0, + 0, + }; + return GetByteBuffer(uniform_params); + }}, + }; + + desc->resize_function = + [input_id, attr]( + const std::map<ValueId, BHWC>& buffers) -> std::pair<uint3, uint3> { + const BHWC& input_shape = buffers.find(input_id)->second; + const BHWC output_shape(input_shape.b, // + input_shape.h / attr.block_size, + input_shape.w / attr.block_size, + input_shape.c * attr.block_size * attr.block_size); + const uint3 grid = uint3(output_shape.w, output_shape.h, + IntegralDivideRoundUp(output_shape.c, 4)); + const uint3 groups_size = GetWorkGroupSizeForGrid(grid); + const int groups_x = IntegralDivideRoundUp(grid.x, groups_size.x); + const int groups_y = IntegralDivideRoundUp(grid.y, groups_size.y); + const int groups_z = IntegralDivideRoundUp(grid.z, groups_size.z); + return std::make_pair(groups_size, uint3(groups_x, groups_y, groups_z)); + }; + return {desc}; +} + +} // namespace metal +} // namespace gpu +} // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/space_to_depth.h
b/tensorflow/lite/delegates/gpu/metal/kernels/space_to_depth.h new file mode 100644 index 00000000000..c46a2dfbaab --- /dev/null +++ b/tensorflow/lite/delegates/gpu/metal/kernels/space_to_depth.h @@ -0,0 +1,37 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_DELEGATES_GPU_METAL_KERNELS_SPACE_TO_DEPTH_H_ +#define TENSORFLOW_LITE_DELEGATES_GPU_METAL_KERNELS_SPACE_TO_DEPTH_H_ + +#include <vector> + +#include "tensorflow/lite/delegates/gpu/common/model.h" +#include "tensorflow/lite/delegates/gpu/common/operations.h" +#include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h" + +namespace tflite { +namespace gpu { +namespace metal { + +std::vector<ComputeTaskDescriptorPtr> SpaceToDepth( + int id, ValueId input_id, ValueId output_id, + const SpaceToDepthAttributes& attr); + +} // namespace metal +} // namespace gpu +} // namespace tflite + +#endif // TENSORFLOW_LITE_DELEGATES_GPU_METAL_KERNELS_SPACE_TO_DEPTH_H_ diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/space_to_depth_test.mm b/tensorflow/lite/delegates/gpu/metal/kernels/space_to_depth_test.mm new file mode 100644 index 00000000000..6e82ebe0361 --- /dev/null +++ b/tensorflow/lite/delegates/gpu/metal/kernels/space_to_depth_test.mm @@ -0,0 +1,153 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+==============================================================================*/ + +#include "tensorflow/lite/delegates/gpu/metal/kernels/space_to_depth.h" + +#import <XCTest/XCTest.h> + +#include +#include +#include <vector> + +#include "tensorflow/lite/delegates/gpu/common/operations.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" +#include "tensorflow/lite/delegates/gpu/common/status.h" +#include "tensorflow/lite/delegates/gpu/common/tensor.h" +#include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h" +#include "tensorflow/lite/delegates/gpu/metal/kernels/test_util.h" +#include "tensorflow/lite/delegates/gpu/metal/runtime_options.h" + +using ::tflite::gpu::BHWC; +using ::tflite::gpu::DataType; +using ::tflite::gpu::OperationType; +using ::tflite::gpu::SpaceToDepthAttributes; +using ::tflite::gpu::TensorRef; +using ::tflite::gpu::metal::CompareVectors; +using ::tflite::gpu::metal::SingleOpModel; + +@interface SpaceToDepthTest : XCTestCase +@end + +@implementation SpaceToDepthTest + +- (void)testTensorShape1x2x2x1BlockSize2 { + const TensorRef<BHWC> input = {.type = DataType::FLOAT32, .shape = BHWC(1, 2, 2, 1), .ref = 0}; + const TensorRef<BHWC> output = {.type = DataType::FLOAT32, .shape = BHWC(1, 1, 1, 4), .ref = 1}; + const SpaceToDepthAttributes attr = {.block_size = 2}; + SingleOpModel model({ToString(OperationType::SPACE_TO_DEPTH), attr}, {input}, {output}); + if (!model.PopulateTensor(0, {1.0f, 2.0f, 3.0f, 4.0f})) { + XCTFail(@"PopulateTensor()"); + } + const auto status = model.Invoke(); + if (!status.ok()) XCTFail(@"%s", status.error_message().c_str()); + const std::vector<float>& actual = model.GetOutput(0); + const std::vector<float> expected = {1.0f, 2.0f, 3.0f, 4.0f}; + XCTAssertEqual(actual[0], expected[0]); + XCTAssertEqual(actual[1], expected[1]); + XCTAssertEqual(actual[2], expected[2]); + XCTAssertEqual(actual[3], expected[3]); +} + +- (void)testTensorShape1x2x2x2BlockSize2 { + const TensorRef<BHWC> input = {.type = DataType::FLOAT32, .shape = BHWC(1, 2, 2, 2), .ref = 0}; + const TensorRef<BHWC> output = {.type = DataType::FLOAT32, .shape = BHWC(1, 1, 1, 8), .ref = 1}; + const SpaceToDepthAttributes attr = {.block_size = 2}; + SingleOpModel model({ToString(OperationType::SPACE_TO_DEPTH), attr}, {input}, {output}); + if (!model.PopulateTensor(0, {1.4f, 2.3f, 3.2f, 4.1f, 5.4f, 6.3f, 7.2f, 8.1f})) { + XCTFail(@"PopulateTensor()"); + } + const auto status = model.Invoke(); + if (!status.ok()) XCTFail(@"%s", status.error_message().c_str()); + const std::vector<float>& actual = model.GetOutput(0); + const std::vector<float> expected = {1.4f, 2.3f, 3.2f, 4.1f, 5.4f, 6.3f, 7.2f, 8.1f}; + XCTAssertEqual(actual[0], expected[0]); + XCTAssertEqual(actual[1], expected[1]); + XCTAssertEqual(actual[2], expected[2]); + XCTAssertEqual(actual[3], expected[3]); + XCTAssertEqual(actual[4], expected[4]); + XCTAssertEqual(actual[5], expected[5]); + XCTAssertEqual(actual[6], expected[6]); + XCTAssertEqual(actual[7], expected[7]); +} + +- (void)testTensorShape1x2x2x3BlockSize2 { + const TensorRef<BHWC> input = {.type = DataType::FLOAT32, .shape = BHWC(1, 2, 2, 3), .ref = 0}; + const TensorRef<BHWC> output = {.type = DataType::FLOAT32, .shape = BHWC(1, 1, 1, 12), .ref = 1}; + const SpaceToDepthAttributes attr = {.block_size = 2}; + SingleOpModel model({ToString(OperationType::SPACE_TO_DEPTH), attr}, {input}, {output}); + if (!model.PopulateTensor(0, {1.0f, 2.0f, 3.0f, // + 4.0f, 5.0f, 6.0f, // + 7.0f, 8.0f, 9.0f, // + 10.0f, 11.0f, 12.0f})) { + XCTFail(@"PopulateTensor()"); + } + const auto status = model.Invoke(); + if (!status.ok()) XCTFail(@"%s",
status.error_message().c_str()); + const std::vector<float>& actual = model.GetOutput(0); + const std::vector<float> expected = {1.0f, 2.0f, 3.0f, // + 4.0f, 5.0f, 6.0f, // + 7.0f, 8.0f, 9.0f, // + 10.0f, 11.0f, 12.0f}; + XCTAssertEqual(actual[0], expected[0]); + XCTAssertEqual(actual[1], expected[1]); + XCTAssertEqual(actual[2], expected[2]); + XCTAssertEqual(actual[3], expected[3]); + XCTAssertEqual(actual[4], expected[4]); + XCTAssertEqual(actual[5], expected[5]); + XCTAssertEqual(actual[6], expected[6]); + XCTAssertEqual(actual[7], expected[7]); + XCTAssertEqual(actual[8], expected[8]); + XCTAssertEqual(actual[9], expected[9]); + XCTAssertEqual(actual[10], expected[10]); + XCTAssertEqual(actual[11], expected[11]); +} + +- (void)testTensorShape1x4x4x1BlockSize2 { + const TensorRef<BHWC> input = {.type = DataType::FLOAT32, .shape = BHWC(1, 4, 4, 1), .ref = 0}; + const TensorRef<BHWC> output = {.type = DataType::FLOAT32, .shape = BHWC(1, 2, 2, 4), .ref = 1}; + const SpaceToDepthAttributes attr = {.block_size = 2}; + SingleOpModel model({ToString(OperationType::SPACE_TO_DEPTH), attr}, {input}, {output}); + if (!model.PopulateTensor(0, {1.0f, 2.0f, 5.0f, 6.0f, // + 3.0f, 4.0f, 7.0f, 8.0f, // + 9.0f, 10.0f, 13.0f, 14.0f, // + 11.0f, 12.0f, 15.0f, 16.0f})) { + XCTFail(@"PopulateTensor()"); + } + const auto status = model.Invoke(); + if (!status.ok()) XCTFail(@"%s", status.error_message().c_str()); + const std::vector<float>& actual = model.GetOutput(0); + const std::vector<float> expected = {1.0f, 2.0f, 3.0f, 4.0f, // + 5.0f, 6.0f, 7.0f, 8.0f, // + 9.0f, 10.0f, 11.0f, 12.0f, // + 13.0f, 14.0f, 15.0f, 16.0f}; + XCTAssertEqual(actual[0], expected[0]); + XCTAssertEqual(actual[1], expected[1]); + XCTAssertEqual(actual[2], expected[2]); + XCTAssertEqual(actual[3], expected[3]); + XCTAssertEqual(actual[4], expected[4]); + XCTAssertEqual(actual[5], expected[5]); + XCTAssertEqual(actual[6], expected[6]); + XCTAssertEqual(actual[7], expected[7]); + XCTAssertEqual(actual[8], expected[8]); + XCTAssertEqual(actual[9], expected[9]); + XCTAssertEqual(actual[10], expected[10]); + XCTAssertEqual(actual[11], expected[11]); + XCTAssertEqual(actual[12], expected[12]); + XCTAssertEqual(actual[13], expected[13]); + XCTAssertEqual(actual[14], expected[14]); + XCTAssertEqual(actual[15], expected[15]); +} + +@end diff --git a/tensorflow/lite/delegates/nnapi/BUILD b/tensorflow/lite/delegates/nnapi/BUILD index ee47ad0e24d..021f74b6ab0 100644 --- a/tensorflow/lite/delegates/nnapi/BUILD +++ b/tensorflow/lite/delegates/nnapi/BUILD @@ -179,6 +179,7 @@ cc_test( ":nnapi_delegate", ":nnapi_delegate_mock_test", "//tensorflow/lite:framework", + "//tensorflow/lite:kernel_api", "//tensorflow/lite:minimal_logging", "//tensorflow/lite/c:common", "//tensorflow/lite/kernels:test_util", diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc index a3a4babd91f..84b3cfb2f15 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc @@ -4185,25 +4185,29 @@ TfLiteStatus StatefulNnApiDelegate::DoPrepare(TfLiteContext* context, .version = 1, }; - std::vector<int>& nodes_to_delegate = supported_nodes; + std::vector<int> nodes_to_delegate; + + int num_partitions; + TfLiteDelegateParams* params_array; if (is_accelerator_specified) { - std::vector<int> device_supported_nodes; - int num_partitions; - TfLiteDelegateParams* params_array; - + // Filter out nodes not supported by the target accelerators. TF_LITE_ENSURE_STATUS(GetNodesSupportedByAccelerator( - context, delegate, nnapi,
supported_nodes, &device_supported_nodes, + context, delegate, nnapi, supported_nodes, &nodes_to_delegate, &num_partitions, &params_array, nnapi_errno)); - - TF_LITE_ENSURE_STATUS(LimitDelegatedPartitions( - delegate_options.max_number_delegated_partitions, - std::vector<TfLiteDelegateParams>(params_array, - params_array + num_partitions), - &device_supported_nodes)); - - nodes_to_delegate = device_supported_nodes; + } else { + nodes_to_delegate = supported_nodes; + auto supported_nodes_int_array = BuildTfLiteIntArray(supported_nodes); + TF_LITE_ENSURE_STATUS(context->PreviewDelegatePartitioning( + context, supported_nodes_int_array.get(), &params_array, + &num_partitions)); } + TF_LITE_ENSURE_STATUS( + LimitDelegatedPartitions(delegate_options.max_number_delegated_partitions, + std::vector<TfLiteDelegateParams>( + params_array, params_array + num_partitions), + &nodes_to_delegate)); + if (nodes_to_delegate.empty()) { return kTfLiteOk; } else { diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_device_selection_test.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate_device_selection_test.cc index bf9e00bee69..d6183e63013 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_device_selection_test.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_device_selection_test.cc @@ -15,6 +15,7 @@ limitations under the License. #include #include +#include <unordered_set> #include #include #include @@ -23,6 +24,7 @@ #include #include +#include "tensorflow/lite/builtin_ops.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h" #include "tensorflow/lite/delegates/nnapi/nnapi_delegate_mock_test.h" @@ -545,13 +547,39 @@ TEST_F(UnsupportedOperationOnDeviceTest, ShouldCacheModelCompilation) { } // Model with a chain of no-op (add with zero operations) +// interleaved with no-op custom nodes.
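+// The custom nodes are never claimed by the NNAPI delegate, so each run of +// ADD nodes between them ends up in its own delegated partition.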
class LongIdentityModel : public MultiOpModel, public AcceleratedModel { public: LongIdentityModel(const std::vector<int>& input_shape, int graph_size, + const std::unordered_set<int>& custom_nodes_indexes, const NnApi* nnapi, const std::string& accelerator_name, int max_nnapi_partitions) : MultiOpModel(), AcceleratedModel(nnapi, accelerator_name, max_nnapi_partitions) { + Init(input_shape, graph_size, custom_nodes_indexes); + } + + LongIdentityModel(const std::vector<int>& input_shape, int graph_size, + const std::unordered_set<int>& custom_nodes_indexes, + const NnApi* nnapi, int max_nnapi_partitions) + : MultiOpModel(), AcceleratedModel(nnapi, false, max_nnapi_partitions) { + Init(input_shape, graph_size, custom_nodes_indexes); + } + + void SetInput(std::vector<float> value) { PopulateTensor(input_, value); } + + int CountNnApiPartitions() { + return std::count_if( + std::begin(interpreter_->execution_plan()), + std::end(interpreter_->execution_plan()), [this](const int node_index) { + return interpreter_->node_and_registration(node_index) + ->first.delegate != nullptr; + }); + } + + private: + void Init(const std::vector<int>& input_shape, int graph_size, + const std::unordered_set<int>& custom_nodes_indexes) { auto* delegate = GetDelegate(); this->SetApplyDelegate([delegate](Interpreter* interpreter) { interpreter->ModifyGraphWithDelegate(delegate); @@ -574,10 +602,15 @@ class LongIdentityModel : public MultiOpModel, public AcceleratedModel { {intermediate_outputs[0]}); for (int i = 0; i < intermediate_outputs.size() - 1; i++) { - AddBuiltinOp(BuiltinOperator_ADD, BuiltinOptions_AddOptions, - CreateAddOptions(builder_).Union(), - {intermediate_outputs[i], zero_input_}, - {intermediate_outputs[i + 1]}); + if (custom_nodes_indexes.count(i + 1) == 1) { + AddCustomOp("custom_no_op", {}, [this]() { return CustomNoOpNode(); }, + {intermediate_outputs[i]}, {intermediate_outputs[i + 1]}); + } else { + AddBuiltinOp(BuiltinOperator_ADD, BuiltinOptions_AddOptions, + CreateAddOptions(builder_).Union(), + {intermediate_outputs[i], zero_input_}, + {intermediate_outputs[i + 1]}); + } } AddBuiltinOp( @@ -592,18 +625,42 @@ class LongIdentityModel : public MultiOpModel, public AcceleratedModel { PopulateTensor(zero_input_, zero); } - void SetInput(std::vector<float> value) { PopulateTensor(input_, value); } + // Returns the registration of a custom node that simply copies input to output.
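+ // The registration below is a pure passthrough: prepare only checks that + // there is exactly one input and one output, and invoke copies the input + // bytes verbatim to the output tensor.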
+ TfLiteRegistration* CustomNoOpNode() { + static TfLiteRegistration no_op = { + .init = [](TfLiteContext* context, const char* buffer, + size_t length) -> void* { return nullptr; }, - int CountNnApiPartitions() { - return std::count_if( - std::begin(interpreter_->execution_plan()), - std::end(interpreter_->execution_plan()), [this](const int node_index) { - return interpreter_->node_and_registration(node_index) - ->first.delegate != nullptr; - }); + .free = [](TfLiteContext* context, void* buffer) -> void {}, + + .prepare = [](TfLiteContext* context, + TfLiteNode* node) -> TfLiteStatus { + if (node->inputs->size != 1 || node->outputs->size != 1) { + return kTfLiteError; + } + + return kTfLiteOk; + }, + + .invoke = [](TfLiteContext* context, TfLiteNode* node) -> TfLiteStatus { + auto input_tensor = context->tensors[node->inputs->data[0]]; + auto output_tensor = context->tensors[node->outputs->data[0]]; + + std::copy(input_tensor.data.raw, + input_tensor.data.raw + input_tensor.bytes, + output_tensor.data.raw); + + return kTfLiteOk; + }, + + .profiling_string = nullptr, + .builtin_code = kTfLiteBuiltinDelegate, + .custom_name = "NoOpTestDelegate", + .version = 1, + }; + + return &no_op; } - - private: int input_; int zero_input_; int output_; @@ -643,7 +700,8 @@ class DelegatePartitionLimitTest // input_shape. void Init(int max_nnapi_partitions, const std::vector<int>& nnapi_partition_sizes, - const std::vector<int>& input_shape) { + const std::vector<int>& input_shape, + bool specify_accelerator = true) { // The graph will have as number of nodes the sum of nodes in the NNAPI // partitions plus nnapi_partition_sizes.size() - 1 nodes that will be // not supported by NNAPI and will cause the @@ -658,20 +716,36 @@ unsupported_ops_idxs.insert(partition_node_idx); } - DelegatePartitionLimitTestNodeFilter()->ConfigureSupportedNodes( - graph_size_, unsupported_ops_idxs); + if (specify_accelerator) { + // Build a model that initially contains a single partition and is then + // partitioned by checking the operations supported by the target + // accelerator. + // This is done because the stubbed GetSupportedOperationsForDevices API + // gives no way to control the size of each partition directly. + DelegatePartitionLimitTestNodeFilter()->ConfigureSupportedNodes( + graph_size_, unsupported_ops_idxs); - nnapi_mock_->StubGetSupportedOperationsForDevicesWith( - [](const ANeuralNetworksModel* model, - const ANeuralNetworksDevice* const* devices, uint32_t num_devices, - bool* supported_ops) -> int { - DelegatePartitionLimitTestNodeFilter()->SetNodeSupport(supported_ops); - return ANEURALNETWORKS_NO_ERROR; - }); + nnapi_mock_->StubGetSupportedOperationsForDevicesWith( + [](const ANeuralNetworksModel* model, + const ANeuralNetworksDevice* const* devices, uint32_t num_devices, + bool* supported_ops) -> int { + DelegatePartitionLimitTestNodeFilter()->SetNodeSupport( + supported_ops); + return ANEURALNETWORKS_NO_ERROR; + }); - model_ = std::make_unique<LongIdentityModel>( - input_shape, graph_size_, nnapi_mock_->GetNnApi(), - /*accelerator_name=*/"test-device", max_nnapi_partitions); + model_ = std::make_unique<LongIdentityModel>( + input_shape, graph_size_, + /*custom_nodes_indexes=*/std::unordered_set<int>(), + nnapi_mock_->GetNnApi(), + /*accelerator_name=*/"test-device", max_nnapi_partitions); + } else { + // Build a model containing custom nodes that won't be supported + // by the delegate and will therefore generate the partitions.
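+ // In this path the partitions are produced by the delegate itself via + // context->PreviewDelegatePartitioning() rather than by the accelerator + // stub.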
+ model_ = std::make_unique<LongIdentityModel>( + input_shape, graph_size_, unsupported_ops_idxs, + nnapi_mock_->GetNnApi(), max_nnapi_partitions); + } } std::unique_ptr<LongIdentityModel> model_; @@ -718,24 +792,44 @@ TEST_F(DelegatePartitionLimitTest, } TEST_F(DelegatePartitionLimitTest, ShouldDelegatePartitionWithHigherNodeCount) { + int kLargestModelSize = 3; Init(/*max_nnapi_partitions=*/1, /*nnapi_partition_sizes=*/{3, 2}, /*input_shape=*/{1, 2, 2, 1}); EXPECT_EQ(model_->CountNnApiPartitions(), 1); - EXPECT_EQ(model_->CountOpsExecutedByCpuKernel(), OriginalGraphSize() - 3); + EXPECT_EQ(model_->CountOpsExecutedByCpuKernel(), + OriginalGraphSize() - kLargestModelSize); } TEST_F(DelegatePartitionLimitTest, ShouldDelegatePartitionsWithHigherNodeCount) { + int kLargestModelSize = 5; + int kSecondLargestModelSize = 4; Init(/*max_nnapi_partitions=*/2, - /*nnapi_partition_sizes=*/{1, 5, 2, 4}, + /*nnapi_partition_sizes=*/ + {1, kLargestModelSize, 2, kSecondLargestModelSize}, /*input_shape=*/{1, 2, 2, 1}); EXPECT_EQ(model_->CountNnApiPartitions(), 2); EXPECT_EQ(model_->CountOpsExecutedByCpuKernel(), OriginalGraphSize() - 9); } +TEST_F(DelegatePartitionLimitTest, + ShouldLimitPartitionsEvenWithoutAcceleratorNameSpecified) { + int kLargestModelSize = 5; + int kSecondLargestModelSize = 4; + Init(/*max_nnapi_partitions=*/2, + /*nnapi_partition_sizes=*/ + {1, kLargestModelSize, 2, kSecondLargestModelSize}, + /*input_shape=*/{1, 2, 2, 1}, /*specify_accelerator=*/false); + + EXPECT_EQ(model_->CountNnApiPartitions(), 2); + EXPECT_EQ( + model_->CountOpsExecutedByCpuKernel(), + OriginalGraphSize() - (kLargestModelSize + kSecondLargestModelSize)); +} + } // namespace } // namespace tflite diff --git a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc index e6574cb7d3b..8f52d0a0be0 100644 --- a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc +++ b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc @@ -1022,7 +1022,7 @@ class Subgraph { TfLiteNode* node, const TfLiteTensor* tensors, const TfLiteSoftmaxParams* params, const std::vector<uint32_t>& xnnpack_tensors) { - if (params->beta == 1.0f) { + if (params->beta != 1.0f) { if (logging_context != nullptr) { logging_context->ReportError( logging_context, "unsupported beta value %.7f in SOFTMAX node #%d", diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/conv_2d_builder.cc b/tensorflow/lite/experimental/delegates/hexagon/builders/conv_2d_builder.cc index 93ff6692a0a..809d6e7d7dc 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/conv_2d_builder.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/conv_2d_builder.cc @@ -135,6 +135,7 @@ TfLiteStatus Conv2dOpBuilder::PopulateSubGraph(const TfLiteIntArray* inputs, // Gather information about the Convolution operations.
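+ // The fused activation is gathered below alongside stride and padding so + // its clamp bounds can be applied to the Hexagon op and undone by a + // requantize at the end.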
TfLitePadding padding_type = kTfLitePaddingUnknown; + TfLiteFusedActivation activation = kTfLiteActNone; int stride_height = 0; int stride_width = 0; bool is_dilated_depthwise_conv = false; @@ -144,12 +145,14 @@ TfLiteStatus Conv2dOpBuilder::PopulateSubGraph(const TfLiteIntArray* inputs, stride_height = conv_params->stride_height; stride_width = conv_params->stride_width; padding_type = conv_params->padding; + activation = conv_params->activation; } else if (op_node_.op_type == OP_DepthwiseSupernode_8x8p32to8) { const TfLiteDepthwiseConvParams* conv_params = reinterpret_cast<const TfLiteDepthwiseConvParams*>(builtin_data_); stride_height = conv_params->stride_height; stride_width = conv_params->stride_width; padding_type = conv_params->padding; + activation = conv_params->activation; // We only support dilation for DepthwiseConv. if (conv_params->dilation_height_factor > 1 || conv_params->dilation_width_factor > 1) { @@ -214,17 +217,32 @@ TfLiteStatus Conv2dOpBuilder::PopulateSubGraph(const TfLiteIntArray* inputs, output_depth_size; GetDims(&output_batch_size, &output_height_size, &output_width_size, &output_depth_size, context->tensors[outputs->data[0]].dims); - // Output min/max. + // Output bounds. // TODO(b/129276536): Add support for other activations here. Current // implementation assumes None/Relu. TF_LITE_ENSURE_STATUS(ComputeMinAndMaxQuantValues( context->tensors[outputs->data[0]], &output_min_, &output_max_, std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max())); - auto* output_min_const = graph_builder_->AddConstNodeWithData( - quant_bound_shape.data(), (char*)&output_min_, sizeof(output_min_)); - auto* output_max_const = graph_builder_->AddConstNodeWithData( - quant_bound_shape.data(), (char*)&output_max_, sizeof(output_max_)); + // These denote the bounds fed to Hexagon's Conv mechanism, which will be + // different from the TFLite tensor bounds if there is a RELU activation. + float conv_output_min = output_min_; + float conv_output_max = output_max_; + if (activation == kTfLiteActRelu6) { + conv_output_min = 0; + conv_output_max = 6; + } else if (activation == kTfLiteActRelu1) { + conv_output_min = -1; + conv_output_max = 1; + } else if (activation == kTfLiteActRelu) { + conv_output_min = 0; + } + auto* conv_output_min_const = graph_builder_->AddConstNodeWithData( + quant_bound_shape.data(), reinterpret_cast<char*>(&conv_output_min), + sizeof(conv_output_min)); + auto* conv_output_max_const = graph_builder_->AddConstNodeWithData( + quant_bound_shape.data(), reinterpret_cast<char*>(&conv_output_max), + sizeof(conv_output_max)); // Bias node. const auto& bias_tensor = context->tensors[inputs->data[2]]; @@ -238,6 +256,7 @@ TfLiteStatus Conv2dOpBuilder::PopulateSubGraph(const TfLiteIntArray* inputs, auto* bias_max_const = graph_builder_->AddConstNodeWithData( quant_bound_shape.data(), (char*)&bias_max_, sizeof(bias_max_)); + TensorID output, output_min, output_max; if (is_dilated_depthwise_conv) { // For dilated Depthwise Conv, we convert this node into SpaceToBatchND, and // then chain Supernode & BatchToSpaceND after it.
@@ -288,8 +307,8 @@ TfLiteStatus Conv2dOpBuilder::PopulateSubGraph(const TfLiteIntArray* inputs, conv_op->AddInput(TensorID(bias_data_node->GetID(), 0)); conv_op->AddInput(TensorID(bias_min_const->GetID(), 0)); conv_op->AddInput(TensorID(bias_max_const->GetID(), 0)); - conv_op->AddInput(TensorID(output_min_const->GetID(), 0)); - conv_op->AddInput(TensorID(output_max_const->GetID(), 0)); + conv_op->AddInput(TensorID(conv_output_min_const->GetID(), 0)); + conv_op->AddInput(TensorID(conv_output_max_const->GetID(), 0)); // The padding is handled by the SpaceToBatch/BatchToSpace ops surrounding // this node. Hence, this op's padding remains VALID only. // tf.nn.with_space_to_batch's docs state the following pattern: @@ -319,14 +338,14 @@ TfLiteStatus Conv2dOpBuilder::PopulateSubGraph(const TfLiteIntArray* inputs, batch_to_space_op->AddInput(conv_output); batch_to_space_op->AddInput(TensorID(dilation_factors_const->GetID(), 0)); batch_to_space_op->AddInput(TensorID(crops_const->GetID(), 0)); - batch_to_space_op->AddInput(TensorID(output_min_const->GetID(), 0)); - batch_to_space_op->AddInput(TensorID(output_max_const->GetID(), 0)); - node_output_ = + batch_to_space_op->AddInput(TensorID(conv_output_min_const->GetID(), 0)); + batch_to_space_op->AddInput(TensorID(conv_output_max_const->GetID(), 0)); + output = batch_to_space_op->AddOutput(sizeof(uint8_t), 4, {output_batch_size, output_height_size, output_width_size, output_depth_size}); - batch_to_space_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); - batch_to_space_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + output_min = batch_to_space_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + output_max = batch_to_space_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); } else { // Standard case. // Padding type. @@ -346,14 +365,39 @@ TfLiteStatus Conv2dOpBuilder::PopulateSubGraph(const TfLiteIntArray* inputs, AddInput(TensorID(bias_data_node->GetID(), 0)); AddInput(TensorID(bias_min_const->GetID(), 0)); AddInput(TensorID(bias_max_const->GetID(), 0)); - AddInput(TensorID(output_min_const->GetID(), 0)); - AddInput(TensorID(output_max_const->GetID(), 0)); + AddInput(TensorID(conv_output_min_const->GetID(), 0)); + AddInput(TensorID(conv_output_max_const->GetID(), 0)); // Outputs - node_output_ = AddOutput(sizeof(uint8_t), 4, - {output_batch_size, output_height_size, - output_width_size, output_depth_size}); - AddOutput(sizeof(float), 4, {1, 1, 1, 1}); - AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + output = AddOutput(sizeof(uint8_t), 4, + {output_batch_size, output_height_size, + output_width_size, output_depth_size}); + output_min = AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + output_max = AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + } + + // Requantize if activation was not None. 
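+ // The Conv output above was clamped to the activation bounds, so append a + // Requantize_8to8 node that rescales values back to the output tensor's + // full quantized range.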
+ if (activation != kTfLiteActNone) { + auto* requantized_min_const = graph_builder_->AddConstNodeWithData( + quant_bound_shape.data(), reinterpret_cast<char*>(&output_min_), + sizeof(output_min_)); + auto* requantized_max_const = graph_builder_->AddConstNodeWithData( + quant_bound_shape.data(), reinterpret_cast<char*>(&output_max_), + sizeof(output_max_)); + auto* requantize_op = graph_builder_->AddNode(GetTFLiteNodeID()); + requantize_op->SetOpType(OP_Requantize_8to8); + requantize_op->AddInput(output); + requantize_op->AddInput(output_min); + requantize_op->AddInput(output_max); + requantize_op->AddInput(TensorID(requantized_min_const->GetID(), 0)); + requantize_op->AddInput(TensorID(requantized_max_const->GetID(), 0)); + node_output_ = + requantize_op->AddOutput(sizeof(uint8_t), 4, + {output_batch_size, output_height_size, + output_width_size, output_depth_size}); + requantize_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + requantize_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + } else { + node_output_ = output; } return kTfLiteOk; diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/reshape_builder.cc b/tensorflow/lite/experimental/delegates/hexagon/builders/reshape_builder.cc index eb755729267..7a69d56b349 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/reshape_builder.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/reshape_builder.cc @@ -58,7 +58,7 @@ TfLiteStatus ReshapeOpBuilder::PopulateSubGraph(const TfLiteIntArray* inputs, AddInput(graph_builder_->GetHexagonTensorId(inputs->data[0])); // Output shape. - TfLiteTensor* shape_tensor; + TfLiteTensor* shape_tensor = nullptr; bool output_shape_is_dynamic = false; if (inputs->size == 2) { shape_tensor = &context->tensors[inputs->data[1]]; diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/BUILD b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/BUILD index a81839aed85..11365d3d3d2 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/BUILD +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/BUILD @@ -27,7 +27,7 @@ hexagon_op_tests( "activations_test.cc", "arg_min_max_test.cc", "concat_test.cc", - "depthwise_conv_test.cc", + "conv_test.cc", "neg_test.cc", "pad_test.cc", "pool_test.cc", diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/conv_test.cc b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/conv_test.cc new file mode 100644 index 00000000000..39694e415a7 --- /dev/null +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/conv_test.cc @@ -0,0 +1,227 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+==============================================================================*/ +#include <numeric> + +#include <gtest/gtest.h> +#include "tensorflow/lite/experimental/delegates/hexagon/builders/tests/hexagon_delegate_op_model.h" +#include "tensorflow/lite/kernels/kernel_util.h" + +namespace tflite { +using testing::ElementsAreArray; + +int NumElements(const std::vector<int>& dims) { + return std::accumulate(dims.begin(), dims.end(), 1, std::multiplies<int>()); +} + +class QuantizedConvolutionOpModel : public SingleOpModelWithHexagon { + public: + QuantizedConvolutionOpModel(BuiltinOperator type, const TensorData& input, + const TensorData& filter, + const TensorData& output, Padding padding_type, + int dilation_factor = 1, int stride_length = 1, + ActivationFunctionType fused_activation_function = + ActivationFunctionType_NONE) { + input_ = AddInput(input); + uint8_t zero = static_cast<uint8_t>(0); + + if (NumElements(filter.shape) == 9) { + filter_ = AddConstInput<uint8_t>( + filter, {zero, zero, zero, zero, zero, zero, zero, zero, zero}); + } else if (NumElements(filter.shape) == 12) { + filter_ = AddConstInput<uint8_t>(filter, {zero, zero, zero, zero, zero, zero, zero, + zero, zero, zero, zero, zero}); + } + + int bias_size = GetShape(filter_)[0]; + // per tensor quantization. + auto bias_scale = GetScale(input_) * GetScale(filter_); + TensorData bias{TensorType_INT32, {bias_size}, 0, 0, bias_scale}; + bias_ = AddInput(bias); + + output_ = AddOutput(output); + + if (type == BuiltinOperator_DEPTHWISE_CONV_2D) { + int input_depth = GetShape(input_)[3]; + int output_depth = GetShape(filter_)[3]; + int depth_mul = output_depth / input_depth; + SetBuiltinOp( + BuiltinOperator_DEPTHWISE_CONV_2D, + BuiltinOptions_DepthwiseConv2DOptions, + CreateDepthwiseConv2DOptions( + builder_, padding_type, stride_length, stride_length, depth_mul, + fused_activation_function, dilation_factor, dilation_factor) + .Union()); + } else { + SetBuiltinOp(BuiltinOperator_CONV_2D, BuiltinOptions_Conv2DOptions, + CreateConv2DOptions(builder_, padding_type, stride_length, + stride_length, fused_activation_function, + dilation_factor, dilation_factor) + .Union()); + } + + BuildInterpreter({GetShape(input_), GetShape(filter_), GetShape(bias_)}); + } + + void SetInput(std::initializer_list<float> data) { + QuantizeAndPopulate<uint8_t>(input_, data); + } + + void SetFilter(std::initializer_list<float> data) { + QuantizeAndPopulate<uint8_t>(filter_, data); + } + + void SetBias(std::initializer_list<float> data) { + QuantizeAndPopulate<int32_t>(bias_, data); + } + + std::vector<float> GetDequantizedOutput() { + return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_), + GetScale(output_), GetZeroPoint(output_)); + } + + protected: + int input_; + int filter_; + int bias_; + int output_; +}; + +TEST(QuantizedConvolutionOpModel, SimpleDilatedDepthwiseConvTestPaddingValid) { + const int depth = 1; + const int image_width = 9; + const int image_height = 9; + const int image_batch_count = 1; + const int filter_size = 3; + const int filter_count = 1; + const int dilation_factor = 3; + QuantizedConvolutionOpModel m( + BuiltinOperator_DEPTHWISE_CONV_2D, + {TensorType_UINT8, + {image_batch_count, image_height, image_width, depth}, + 0, + 255}, + {TensorType_UINT8, + {depth, filter_size, filter_size, filter_count}, + 0, + 255}, + {TensorType_UINT8, {}, 0, 255}, Padding_VALID, dilation_factor); + + // The image matrix is: + // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 1 | 1 | 1 |
0 | 0 | 0 | + // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | + // clang-format off + m.SetInput({0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 1, 1, 0, 0, 0, + 0, 0, 0, 1, 1, 1, 0, 0, 0, + 0, 0, 0, 1, 1, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0}); + // clang-format on + // The filter matrix is: + // | 1 | 2 | 3 | + // | 4 | 5 | 6 | + // | 7 | 8 | 9 | + m.SetFilter({1, 2, 3, 4, 5, 6, 7, 8, 9}); + // No bias for this test. + m.SetBias({0}); + m.ApplyDelegateAndInvoke(); + + // Since the dilation rate is 3 this will reduce the size of the output from + // 10x10 to 3x3 of all 5s. Specifically: + // | 5 | 5 | 5 | + // | 5 | 5 | 5 | + // | 5 | 5 | 5 | + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray({5, 5, 5, 5, 5, 5, 5, 5, 5})); +} + +TEST(QuantizedConvolutionOpModel, SimpleConvTestNoActivation) { + QuantizedConvolutionOpModel m( + BuiltinOperator_CONV_2D, {TensorType_UINT8, {2, 2, 4, 1}, -63.5, 64}, + {TensorType_UINT8, {3, 2, 2, 1}, -63.5, 64}, + {TensorType_UINT8, {}, -127, 128}, Padding_VALID, /*dilation_factor=*/1, + /*stride_length=*/2); + m.SetInput({ + // First batch + 1, 1, 1, 1, // row = 1 + 2, 2, 2, 2, // row = 2 + // Second batch + 1, 2, 3, 4, // row = 1 + 1, 2, 3, 4, // row = 2 + }); + m.SetFilter({ + 1, 2, 3, 4, // first 2x2 filter + -1, 1, -1, 1, // second 2x2 filter + -1, -1, 1, 1, // third 2x2 filter + }); + m.SetBias({1, 2, 3}); + + m.ApplyDelegateAndInvoke(); + + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear( + { + 18, 2, 5, // first batch, left + 18, 2, 5, // first batch, right + 17, 4, 3, // second batch, left + 37, 4, 3, // second batch, right + }, + 1e-5))); +} + +TEST(QuantizedConvolutionOpModel, SimpleConvTestReLU6Activation) { + QuantizedConvolutionOpModel m( + BuiltinOperator_CONV_2D, {TensorType_UINT8, {2, 2, 4, 1}, -63.5, 64}, + {TensorType_UINT8, {3, 2, 2, 1}, -63.5, 64}, + {TensorType_UINT8, {}, -127, 128}, Padding_VALID, /*dilation_factor=*/1, + /*stride_length=*/2, ActivationFunctionType_RELU6); + m.SetInput({ + // First batch + 1, 1, 1, 1, // row = 1 + 2, 2, 2, 2, // row = 2 + // Second batch + 1, 2, 3, 4, // row = 1 + 1, 2, 3, 4, // row = 2 + }); + m.SetFilter({ + 1, 2, 3, 4, // first 2x2 filter + -1, 1, -1, 1, // second 2x2 filter + -1, -1, 1, 1, // third 2x2 filter + }); + m.SetBias({1, 2, 3}); + + m.ApplyDelegateAndInvoke(); + + EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear( + { + 6, 2, 5, // first batch, left + 6, 2, 5, // first batch, right + 6, 4, 3, // second batch, left + 6, 4, 3, // second batch, right + }, + 1e-5))); +} + +} // namespace tflite diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/depthwise_conv_test.cc b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/depthwise_conv_test.cc deleted file mode 100644 index 30966b0181b..00000000000 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/depthwise_conv_test.cc +++ /dev/null @@ -1,138 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License.
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#include <gtest/gtest.h> -#include "tensorflow/lite/experimental/delegates/hexagon/builders/tests/hexagon_delegate_op_model.h" - -namespace tflite { -using testing::ElementsAreArray; - -class QuantizedDepthwiseConvolutionOpModel : public SingleOpModelWithHexagon { - public: - QuantizedDepthwiseConvolutionOpModel(const TensorData& input, - const TensorData& filter, - const TensorData& output, - Padding padding_type, - int dilation_factor = 1) { - input_ = AddInput(input); - uint8_t zero = static_cast<uint8_t>(0); - filter_ = AddConstInput<uint8_t>( - filter, {zero, zero, zero, zero, zero, zero, zero, zero, zero}); - - int bias_size = GetShape(filter_)[3]; - // per tensor quantization. - auto bias_scale = GetScale(input_) * GetScale(filter_); - TensorData bias{TensorType_INT32, {bias_size}, 0, 0, bias_scale}; - bias_ = AddInput(bias); - - output_ = AddOutput(output); - - int input_depth = GetShape(input_)[3]; - int output_depth = GetShape(filter_)[3]; - int depth_mul = output_depth / input_depth; - - SetBuiltinOp( - BuiltinOperator_DEPTHWISE_CONV_2D, - BuiltinOptions_DepthwiseConv2DOptions, - CreateDepthwiseConv2DOptions(builder_, padding_type, 1, 1, depth_mul, - ActivationFunctionType_NONE, - dilation_factor, dilation_factor) - .Union()); - - BuildInterpreter({GetShape(input_), GetShape(filter_), GetShape(bias_)}); - } - - void SetInput(std::initializer_list<float> data) { - QuantizeAndPopulate<uint8_t>(input_, data); - } - - void SetFilter(std::initializer_list<float> data) { - QuantizeAndPopulate<uint8_t>(filter_, data); - } - - void SetBias(std::initializer_list<float> data) { - QuantizeAndPopulate<int32_t>(bias_, data); - } - - std::vector<float> GetDequantizedOutput() { - return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_), - GetScale(output_), GetZeroPoint(output_)); - } - - protected: - int input_; - int filter_; - int bias_; - int output_; -}; - -TEST(QuantizedDepthwiseConvolutionOpTest, SimpleDilatedTestPaddingValid) { - const int depth = 1; - const int image_width = 9; - const int image_height = 9; - const int image_batch_count = 1; - const int filter_size = 3; - const int filter_count = 1; - const int dilation_factor = 3; - QuantizedDepthwiseConvolutionOpModel m( - {TensorType_UINT8, - {image_batch_count, image_height, image_width, depth}, - 0, - 255}, - {TensorType_UINT8, - {depth, filter_size, filter_size, filter_count}, - 0, - 255}, - {TensorType_UINT8, {}, 0, 255}, Padding_VALID, dilation_factor); - - // The image matrix is: - // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | - // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | - // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | - // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | - // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | - // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | - // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | - // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | - // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | - // clang-format off - m.SetInput({0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 1, 1, 0, 0, 0, - 0, 0, 0, 1, 1, 1, 0, 0, 0, - 0, 0, 0, 1, 1, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0}); -
// clang-format on - // The filter matrix is: - // | 1 | 2 | 3 | - // | 4 | 5 | 6 | - // | 7 | 8 | 9 | - m.SetFilter({1, 2, 3, 4, 5, 6, 7, 8, 9}); - // No bias for this test. - m.SetBias({0}); - m.ApplyDelegateAndInvoke(); - - // Since the dilation rate is 3 this will reduce the size of the output from - // 10x10 to 3x3 of all 5s. Specifically: - // | 5 | 5 | 5 | - // | 5 | 5 | 5 | - // | 5 | 5 | 5 | - EXPECT_THAT(m.GetDequantizedOutput(), - ElementsAreArray({5, 5, 5, 5, 5, 5, 5, 5, 5})); -} - -} // namespace tflite diff --git a/tensorflow/lite/experimental/delegates/hexagon/hexagon_nn/hexagon_nn_init.h b/tensorflow/lite/experimental/delegates/hexagon/hexagon_nn/hexagon_nn_init.h index 812eb792a5c..d142edd4d03 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/hexagon_nn/hexagon_nn_init.h +++ b/tensorflow/lite/experimental/delegates/hexagon/hexagon_nn/hexagon_nn_init.h @@ -21,6 +21,7 @@ extern "C" { void hexagon_nn_global_teardown(void); void hexagon_nn_global_init(void); bool hexagon_nn_is_device_supported(); +int hexagon_nn_hexagon_interface_version(void); #ifdef __cplusplus } #endif diff --git a/tensorflow/lite/experimental/delegates/hexagon/hexagon_nn/version_scripts.lds b/tensorflow/lite/experimental/delegates/hexagon/hexagon_nn/version_scripts.lds index 1e254e7eb0e..7b003afc770 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/hexagon_nn/version_scripts.lds +++ b/tensorflow/lite/experimental/delegates/hexagon/hexagon_nn/version_scripts.lds @@ -19,6 +19,7 @@ VERS_1.0 { hexagon_nn_global_init; hexagon_nn_is_device_supported; hexagon_nn_version; + hexagon_nn_hexagon_interface_version; # Hide everything else. local: diff --git a/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_lstm_test.py b/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_lstm_test.py index 48e434a9591..303741ba89d 100644 --- a/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_lstm_test.py +++ b/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_lstm_test.py @@ -258,6 +258,10 @@ class UnidirectionalSequenceLstmTest(test_util.TensorFlowTestCase): result = self.tfliteInvoke(new_sess, test_inputs, x, output_class, False) self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2)) + # Test MLIR-Converted model. + result = self.tfliteInvoke(new_sess, test_inputs, x, output_class, True) + self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2)) + @test_util.enable_control_flow_v2 def testDynamicRnnMultiRnnCell(self): sess = tf.compat.v1.Session(config=CONFIG) @@ -278,6 +282,10 @@ class UnidirectionalSequenceLstmTest(test_util.TensorFlowTestCase): result = self.tfliteInvoke(new_sess, test_inputs, x, output_class, False) self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2)) + # Test MLIR-converted model. 
+ result = self.tfliteInvoke(new_sess, test_inputs, x, output_class, True) + self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2)) + if __name__ == "__main__": test.main() diff --git a/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_rnn_test.py b/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_rnn_test.py index 47799b705a3..1834d4738e9 100644 --- a/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_rnn_test.py +++ b/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_rnn_test.py @@ -249,6 +249,10 @@ class UnidirectionalSequenceRnnTest(test_util.TensorFlowTestCase): result = self.tfliteInvoke(new_sess, test_inputs, x, output_class, False) self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2)) + # Test MLIR-converted model. + result = self.tfliteInvoke(new_sess, test_inputs, x, output_class, True) + self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2)) + @test_util.enable_control_flow_v2 def testDynamicRnnMultiRnnCell(self): sess = tf.compat.v1.Session(config=CONFIG) @@ -269,6 +273,10 @@ class UnidirectionalSequenceRnnTest(test_util.TensorFlowTestCase): result = self.tfliteInvoke(new_sess, test_inputs, x, output_class, False) self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2)) + # Test MLIR-converted model. + result = self.tfliteInvoke(new_sess, test_inputs, x, output_class, True) + self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2)) + if __name__ == "__main__": test.main() diff --git a/tensorflow/lite/experimental/objc/BUILD.apple b/tensorflow/lite/experimental/objc/BUILD.apple index 18b53980133..ff7e8fa58e9 100644 --- a/tensorflow/lite/experimental/objc/BUILD.apple +++ b/tensorflow/lite/experimental/objc/BUILD.apple @@ -1,6 +1,6 @@ # TensorFlow Lite for Objective-C -load("//tensorflow/lite:special_rules.bzl", "ios_visibility_whitelist") +load("//tensorflow/lite:special_rules.bzl", "ios_visibility_whitelist", "tflite_ios_lab_runner") load("//tensorflow/lite/experimental/ios:ios.bzl", "TFL_DEFAULT_TAGS", "TFL_DISABLED_SANITIZER_TAGS", "TFL_MINIMUM_OS_VERSION") load("@build_bazel_rules_apple//apple:ios.bzl", "ios_application", "ios_unit_test") @@ -72,6 +72,7 @@ ios_unit_test( name = "Tests", size = "medium", minimum_os_version = TFL_MINIMUM_OS_VERSION, + runner = tflite_ios_lab_runner("IOS_LATEST"), tags = TFL_DEFAULT_TAGS + TFL_DISABLED_SANITIZER_TAGS + [ "nozapfhahn", # TODO(b/145984659): Enable after solving tool failure. 
],
 
diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/FileUtil.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/FileUtil.java
index c7662d149e9..4a1e7d4a65e 100644
--- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/FileUtil.java
+++ b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/FileUtil.java
@@ -24,6 +24,7 @@ import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.nio.MappedByteBuffer;
 import java.nio.channels.FileChannel;
+import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.List;
 import org.checkerframework.checker.nullness.qual.NonNull;
@@ -46,10 +47,28 @@ public class FileUtil {
   @NonNull
   public static List<String> loadLabels(@NonNull Context context, @NonNull String filePath)
       throws IOException {
+    return loadLabels(context, filePath, Charset.defaultCharset());
+  }
+
+  /**
+   * Loads labels from the label file into a list of strings.
+   *
+   * <p>A legal label file is a plain text file whose contents are split into lines; each line
+   * is an individual value. The file should be in the assets of the context.
+   *
+   * @param context The context that holds the assets.
+   * @param filePath The path of the label file, relative to the assets directory.
+   * @param cs {@code Charset} to use when decoding the content of the label file.
+   * @return a list of labels.
+   * @throws IOException if an error occurs when opening or reading the file.
+   */
+  @NonNull
+  public static List<String> loadLabels(
+      @NonNull Context context, @NonNull String filePath, Charset cs) throws IOException {
     SupportPreconditions.checkNotNull(context, "Context cannot be null.");
     SupportPreconditions.checkNotNull(filePath, "File path cannot be null.");
     InputStream inputStream = context.getAssets().open(filePath);
-    return loadLabels(inputStream);
+    return loadLabels(inputStream, cs);
   }
 
   /**
@@ -62,8 +81,23 @@ public class FileUtil {
    */
   @NonNull
   public static List<String> loadLabels(@NonNull InputStream inputStream) throws IOException {
+    return loadLabels(inputStream, Charset.defaultCharset());
+  }
+
+  /**
+   * Loads labels from an input stream of an opened label file. See details for label files in
+   * {@link FileUtil#loadLabels(Context, String)}.
+   *
+   * @param inputStream the input stream of an opened label file.
+   * @param cs {@code Charset} to use when decoding the content of the label file.
+   * @return a list of labels.
+   * @throws IOException if an error occurs when opening or reading the file.
+   */
+  @NonNull
+  public static List<String> loadLabels(@NonNull InputStream inputStream, Charset cs)
+      throws IOException {
     List<String> labels = new ArrayList<>();
-    BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream));
+    BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, cs));
     String line;
     while ((line = reader.readLine()) != null) {
       labels.add(line);
@@ -72,6 +106,38 @@ public class FileUtil {
     return labels;
   }
 
+  /**
+   * Loads a vocabulary file (a single-column text file) into a list of strings.
+   *
+   * <p>A vocabulary file is a single-column plain text file whose contents are split into lines,
+   * and each line is an individual value. The file should be in the assets of the context.
+   *
+   * @param context The context that holds the assets.
+   * @param filePath The path of the vocabulary file, relative to the assets directory.
+   * @param cs {@code Charset} to use when decoding the content of the file.
+   * @return a list of vocabulary words.
+   * @throws IOException if an error occurs when opening or reading the file.
+   */
+  @NonNull
+  public static List<String> loadSingleColumnTextFile(
+      @NonNull Context context, @NonNull String filePath, Charset cs) throws IOException {
+    return loadLabels(context, filePath, cs);
+  }
+
+  /**
+   * Loads vocabulary from an input stream of an opened vocabulary file (which is a single-column
+   * text file). See details for vocabulary files in {@link
+   * FileUtil#loadSingleColumnTextFile(Context, String, Charset)}.
+   *
+   * @param inputStream the input stream of an opened vocabulary file.
+   * @param cs {@code Charset} to use when decoding the content of the file.
+   * @return a list of vocabulary words.
+   * @throws IOException if an error occurs when opening or reading the file.
+   */
+  @NonNull
+  public static List<String> loadSingleColumnTextFile(@NonNull InputStream inputStream, Charset cs)
+      throws IOException {
+    return loadLabels(inputStream, cs);
+  }
+
   /**
    * Loads a file from the asset folder through memory mapping.
    *
diff --git a/tensorflow/lite/experimental/swift/BUILD.apple b/tensorflow/lite/experimental/swift/BUILD.apple
index ba014251bcc..2ce8428b1ce 100644
--- a/tensorflow/lite/experimental/swift/BUILD.apple
+++ b/tensorflow/lite/experimental/swift/BUILD.apple
@@ -1,6 +1,6 @@
 # TensorFlow Lite for Swift
 
-load("//tensorflow/lite:special_rules.bzl", "ios_visibility_whitelist")
+load("//tensorflow/lite:special_rules.bzl", "ios_visibility_whitelist", "tflite_ios_lab_runner")
 load("//tensorflow/lite/experimental/ios:ios.bzl", "TFL_DEFAULT_TAGS", "TFL_DISABLED_SANITIZER_TAGS", "TFL_MINIMUM_OS_VERSION")
 load("@build_bazel_rules_apple//apple:ios.bzl", "ios_application", "ios_unit_test")
 load("@build_bazel_rules_swift//swift:swift.bzl", "swift_library")
@@ -25,6 +25,7 @@ ios_unit_test(
     name = "Tests",
     size = "small",
     minimum_os_version = TFL_MINIMUM_OS_VERSION,
+    runner = tflite_ios_lab_runner("IOS_LATEST"),
     tags = TFL_DEFAULT_TAGS + TFL_DISABLED_SANITIZER_TAGS,
     deps = [
         ":TestsLibrary",
diff --git a/tensorflow/lite/g3doc/convert/cmdline.md b/tensorflow/lite/g3doc/convert/cmdline.md
index 2d89c04e6f1..a6594d4a429 100644
--- a/tensorflow/lite/g3doc/convert/cmdline.md
+++ b/tensorflow/lite/g3doc/convert/cmdline.md
@@ -70,3 +70,47 @@ bazel run //tensorflow/lite/python:tflite_convert -- \
   --saved_model_dir=/tmp/mobilenet_saved_model \
   --output_file=/tmp/mobilenet.tflite
 ```
+
+### Custom ops in the new converter
+
+There is a behavior change in how models containing
+[custom ops](https://www.tensorflow.org/lite/guide/ops_custom) (for which users
+previously had to set --allow\_custom\_ops) are handled in the
+[new converter](https://github.com/tensorflow/tensorflow/blob/917ebfe5fc1dfacf8eedcc746b7989bafc9588ef/tensorflow/lite/python/lite.py#L81).
+
+**Built-in TensorFlow op**
+
+If you are converting a model with a built-in TensorFlow op that does not exist
+in TensorFlow Lite, you should set the --allow\_custom\_ops argument (same as
+before), explained [here](https://www.tensorflow.org/lite/guide/ops_custom).
+
+**Custom op in TensorFlow**
+
+If you are converting a model with a custom TensorFlow op, it is recommended
+that you write a [TensorFlow kernel](https://www.tensorflow.org/guide/create_op)
+and [TensorFlow Lite kernel](https://www.tensorflow.org/lite/guide/ops_custom).
+This ensures that the model works end-to-end, from TensorFlow to
+TensorFlow Lite. This also requires setting the --allow\_custom\_ops argument.
+
+**Advanced custom op usage (not recommended)**
+
+If the above is not possible, you can still convert a TensorFlow model
+containing a custom op without a corresponding kernel. You will need to pass the
+[OpDef](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/op_def.proto)
+of the custom op in TensorFlow using the --custom\_opdefs flag, as long as you
+have the corresponding OpDef registered in the TensorFlow global op registry.
+This ensures that the TensorFlow model is valid (i.e. loadable by the
+TensorFlow runtime).
+
+If the custom op is not part of the global TensorFlow op registry, then the
+corresponding OpDef needs to be specified via the --custom\_opdefs flag. This is
+a list of OpDef protos, serialized as strings, that need to be additionally
+registered. Below is an example of a TFLiteAwesomeCustomOp with 2 inputs, 1
+output, and 2 attributes:
+
+```
+--custom_opdefs="name: 'TFLiteAwesomeCustomOp' input_arg: { name: 'InputA'
+type: DT_FLOAT } input_arg: { name: 'InputB' type: DT_FLOAT }
+output_arg: { name: 'Output' type: DT_FLOAT } attr : { name: 'Attr1' type:
+'float'} attr : { name: 'Attr2' type: 'list(float)'}"
+```
diff --git a/tensorflow/lite/g3doc/convert/python_api.md b/tensorflow/lite/g3doc/convert/python_api.md
index f9f79fb9a31..4c22d6ac860 100644
--- a/tensorflow/lite/g3doc/convert/python_api.md
+++ b/tensorflow/lite/g3doc/convert/python_api.md
@@ -180,3 +180,47 @@ either install the nightly build with
 [pip](https://www.tensorflow.org/install/pip) (recommended) or
 [Docker](https://www.tensorflow.org/install/docker), or
 [build the pip package from source](https://www.tensorflow.org/install/source).
+
+### Custom ops in the experimental new converter
+
+There is a behavior change in how models containing
+[custom ops](https://www.tensorflow.org/lite/guide/ops_custom) (for which users
+previously had to set allow\_custom\_ops) are handled in the
+[new converter](https://github.com/tensorflow/tensorflow/blob/917ebfe5fc1dfacf8eedcc746b7989bafc9588ef/tensorflow/lite/python/lite.py#L81).
+
+**Built-in TensorFlow op**
+
+If you are converting a model with a built-in TensorFlow op that does not exist
+in TensorFlow Lite, you should set the allow\_custom\_ops attribute (same as
+before), explained [here](https://www.tensorflow.org/lite/guide/ops_custom).
+
+**Custom op in TensorFlow**
+
+If you are converting a model with a custom TensorFlow op, it is recommended
+that you write a [TensorFlow kernel](https://www.tensorflow.org/guide/create_op)
+and [TensorFlow Lite kernel](https://www.tensorflow.org/lite/guide/ops_custom).
+This ensures that the model works end-to-end, from TensorFlow to
+TensorFlow Lite. This also requires setting the allow\_custom\_ops attribute.
+
+**Advanced custom op usage (not recommended)**
+
+If the above is not possible, you can still convert a TensorFlow model
+containing a custom op without a corresponding kernel. You will need to pass the
+[OpDef](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/op_def.proto)
+of the custom op in TensorFlow using the custom\_opdefs attribute, as long as
+you have the corresponding OpDef registered in the TensorFlow global op
+registry. This ensures that the TensorFlow model is valid (i.e. loadable by the
+TensorFlow runtime).
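(Editor's aside: the "TensorFlow Lite kernel" half of the recommended path above boils down to a `TfLiteRegistration`. The sketch below is illustrative only and is not part of this change; it assumes the stock `tensorflow/lite/c/common.h` and `kernel_util.h` APIs, and reuses the hypothetical `TFLiteAwesomeCustomOp` from the example, with a float pass-through body standing in for the real computation.)

```cpp
#include <cstring>

#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/kernel_util.h"

namespace tflite {

TfLiteStatus AwesomePrepare(TfLiteContext* context, TfLiteNode* node) {
  // Give the output the same shape as the first input.
  const TfLiteTensor* input = GetInput(context, node, 0);
  TfLiteTensor* output = GetOutput(context, node, 0);
  return context->ResizeTensor(context, output,
                               TfLiteIntArrayCopy(input->dims));
}

TfLiteStatus AwesomeEval(TfLiteContext* context, TfLiteNode* node) {
  const TfLiteTensor* input = GetInput(context, node, 0);
  TfLiteTensor* output = GetOutput(context, node, 0);
  // Placeholder computation: copy the input floats to the output.
  std::memcpy(output->data.f, input->data.f, input->bytes);
  return kTfLiteOk;
}

TfLiteRegistration* Register_TFLITE_AWESOME_CUSTOM_OP() {
  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
                                 AwesomePrepare, AwesomeEval};
  return &r;
}

}  // namespace tflite
```

At runtime the kernel is wired up through the resolver, e.g. `resolver.AddCustom("TFLiteAwesomeCustomOp", Register_TFLITE_AWESOME_CUSTOM_OP());`.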
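(A second aside: "registered in the TensorFlow global op registry" can be checked programmatically. A minimal sketch, assuming the stock `OpRegistry` API from `tensorflow/core/framework/op.h`; the lookup returns a NotFound status when the op was never linked in or `REGISTER_OP`'d.)

```cpp
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_def.pb.h"

// Returns OK iff the hypothetical custom op is visible to this TF runtime.
tensorflow::Status LookUpAwesomeOpDef(const tensorflow::OpDef** op_def) {
  return tensorflow::OpRegistry::Global()->LookUpOpDef("TFLiteAwesomeCustomOp",
                                                       op_def);
}
```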
+
+If the custom op is not part of the global TensorFlow op registry, then the
+corresponding OpDef needs to be specified via the custom\_opdefs attribute.
+This is a list of OpDef protos, serialized as strings, that need to be
+additionally registered. Below is an example of a TFLiteAwesomeCustomOp with 2
+inputs, 1 output, and 2 attributes:
+
+```
+converter.custom_opdefs = "name: 'TFLiteAwesomeCustomOp' input_arg: { name: 'InputA'
+type: DT_FLOAT } input_arg: { name: 'InputB' type: DT_FLOAT }
+output_arg: { name: 'Output' type: DT_FLOAT } attr : { name: 'Attr1' type:
+'float'} attr : { name: 'Attr2' type: 'list(float)'}"
+```
diff --git a/tensorflow/lite/g3doc/guide/hosted_models.md b/tensorflow/lite/g3doc/guide/hosted_models.md
index 242169cd1db..5304fe7a455 100644
--- a/tensorflow/lite/g3doc/guide/hosted_models.md
+++ b/tensorflow/lite/g3doc/guide/hosted_models.md
@@ -49,8 +49,8 @@ Inception_V4_quant | [paper](https://arxiv.org/abs/1602.07261), [tflite
 
 Note: The model files include both TF Lite FlatBuffer and Tensorflow frozen
 Graph.
 
-Note: Performance numbers were benchmarked on Pixel-2 using single thread large
-core. Accuracy numbers were computed using the
+Note: Performance numbers were benchmarked on Pixel-3 (Android 10).
+Accuracy numbers were computed using the
 [TFLite accuracy tool](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/tools/accuracy/ilsvrc).
 
 ### Floating point models
@@ -105,8 +105,9 @@ MnasNet_1.0_192 | [paper](https://arxiv.org/abs/1807.11626), [tflite&pb](https:
 MnasNet_1.0_224 | [paper](https://arxiv.org/abs/1807.11626), [tflite&pb](https://storage.cloud.google.com/download.tensorflow.org/models/tflite/mnasnet_1.0_224_09_07_2018.tgz) | 17 Mb | 74.08% | 91.75% | 19.4 ms | 8.7 ms | 19 ms
 MnasNet_1.3_224 | [paper](https://arxiv.org/abs/1807.11626), [tflite&pb](https://storage.cloud.google.com/download.tensorflow.org/models/tflite/mnasnet_1.3_224_09_07_2018.tgz) | 24 Mb | 75.24% | 92.55% | 27.9 ms | 10.6 ms | 22.0 ms
 
-Note: Performance numbers were benchmarked on Pixel-1 using single thread large
-BIG core.
+Note: Performance numbers were benchmarked on Pixel-3 (Android 10).
+Accuracy numbers were computed using the
+[TFLite accuracy tool](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/tools/accuracy/ilsvrc).
 
 ## Object detection
diff --git a/tensorflow/lite/g3doc/guide/ops_version.md b/tensorflow/lite/g3doc/guide/ops_version.md
index 610e13c1b9c..1273ed306e6 100644
--- a/tensorflow/lite/g3doc/guide/ops_version.md
+++ b/tensorflow/lite/g3doc/guide/ops_version.md
@@ -209,8 +209,8 @@ is supported for every node in Delegation code.
 
 ```
 const int kMinVersion = 1;
 TfLiteNode* node;
-TfLiteRegistration;
-context->GetNodeAndRegistration(context, node_index, &node, &registration);
+TfLiteRegistration* registration = nullptr;
+TF_LITE_ENSURE_STATUS(context->GetNodeAndRegistration(context, node_index, &node, &registration));
 
 if (registration->version > kMinVersion) {
   // Reject the node if the version isn't supported.
diff --git a/tensorflow/lite/g3doc/models/segmentation/overview.md b/tensorflow/lite/g3doc/models/segmentation/overview.md
index 8ceeb7515c4..816ba9244f3 100644
--- a/tensorflow/lite/g3doc/models/segmentation/overview.md
+++ b/tensorflow/lite/g3doc/models/segmentation/overview.md
@@ -8,10 +8,13 @@
 _DeepLab_ is a state-of-art deep learning model for semantic image
 segmentation, where the goal is to assign semantic labels (e.g. person, dog,
 cat) to every pixel in the input image.
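(Editor's aside before the segmentation hunk continues: the corrected `GetNodeAndRegistration` snippet in the ops_version.md hunk above is typically applied from a delegate's partitioning pass. A minimal sketch assuming only the stable `TfLiteContext` API; `max_version` mirrors the doc's `kMinVersion` idea.)

```cpp
#include "tensorflow/lite/c/common.h"

// Returns true iff the node at node_index uses an op version this delegate
// can handle.
bool DelegateSupportsNode(TfLiteContext* context, int node_index,
                          int max_version) {
  TfLiteNode* node = nullptr;
  TfLiteRegistration* registration = nullptr;
  if (context->GetNodeAndRegistration(context, node_index, &node,
                                      &registration) != kTfLiteOk) {
    return false;
  }
  // Reject the node if its op version is newer than the delegate supports.
  return registration->version <= max_version;
}
```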
-If you are new to TensorFlow Lite and are working with iOS, we +If you are new to TensorFlow Lite and are working with Android or iOS, we recommend exploring the following example applications that can help you get started. +Android +example + iOS example diff --git a/tensorflow/lite/g3doc/models/style_transfer/overview.ipynb b/tensorflow/lite/g3doc/models/style_transfer/overview.ipynb index 4d6f31b8f62..e71b07661a0 100644 --- a/tensorflow/lite/g3doc/models/style_transfer/overview.ipynb +++ b/tensorflow/lite/g3doc/models/style_transfer/overview.ipynb @@ -82,7 +82,17 @@ "\n", "![Style transfer example](https://storage.googleapis.com/download.tensorflow.org/models/tflite/arbitrary_style_transfer/table.png)\n", "\n", - "This tutorial shows how to use a pre-trained TensorFlow Lite model to apply style transfer on any pair of content and style image. You can use the pre-trained model to add style transfer to your own mobile applications.\n", + "If you are new to TensorFlow Lite and are working with Android, we\n", + "recommend exploring the following example applications that can help you get\n", + "started.\n", + "\n", + "\u003ca class=\"button button-primary\" href=\"https://github.com/tensorflow/examples/tree/master/lite/examples/style_transfer/android\"\u003eAndroid\n", + "example\u003c/a\u003e\n", + "\n", + "If you are using a platform other than Android or iOS, or you are already\n", + "familiar with the\n", + "\u003ca href=\"https://www.tensorflow.org/api_docs/python/tf/lite\"\u003eTensorFlow Lite\n", + "APIs\u003c/a\u003e, you can follow this tutorial to learn how to apply style transfer on any pair of content and style image with a pre-trained TensorFlow Lite model. You can use the model to add style transfer to your own mobile applications.\n", "\n", "The model is open-sourced on [GitHub](https://github.com/tensorflow/magenta/tree/master/magenta/models/arbitrary_image_stylization#train-a-model-on-a-large-dataset-with-data-augmentation-to-run-on-mobile). You can retrain the model with different parameters (e.g. increase content layers' weights to make the output image look more like the content image)." ] @@ -475,30 +485,29 @@ "imshow(stylized_image_blended, 'Blended Stylized Image')" ] }, - - { "cell_type": "markdown", "metadata": { "colab_type": "text", - "id": "MQZXL7kON-gM" + "id": "9k9jGIep8p1c" }, "source": [ "## Performance Benchmarks\n", "\n", "Performance benchmark numbers are generated with the tool [described here](https://www.tensorflow.org/lite/performance/benchmarks).\n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n", - " \n", - "\n", - "\n", - "\n", - "
<table><thead><tr><th>Model name</th> <th>Model size</th> <th>Device</th> <th>NNAPI</th> <th>CPU</th></tr></thead>\n",
-        "<tr> <td rowspan=3> <a href=\"https://storage.googleapis.com/download.tensorflow.org/models/tflite/arbitrary_style_transfer/style_predict_quantized_256.tflite\">Style prediction model</a> </td> <td rowspan=3>2.8 Mb</td> <td>Pixel 3 (Android 10)</td> <td>142ms</td><td>14ms*</td></tr>\n",
-        "<tr><td>Pixel 4 (Android 10)</td> <td>5.2ms</td><td>6.7ms*</td></tr>\n",
-        "<tr><td>iPhone XS (iOS 12.4.1)</td> <td></td><td>10.7ms**</td></tr>\n",
-        "<tr> <td rowspan=3> <a href=\"https://storage.googleapis.com/download.tensorflow.org/models/tflite/arbitrary_style_transfer/style_transfer_quantized_dynamic.tflite\">Style transform model</a> </td> <td rowspan=3>0.2s Mb</td> <td>Pixel 3 (Android 10)</td> <td></td><td>540ms*</td></tr>\n",
-        "<tr><td>Pixel 4 (Android 10)</td> <td></td><td>405ms*</td></tr>\n",
-        "<tr><td>iPhone XS (iOS 12.4.1)</td> <td></td><td>251ms**</td></tr></table>
\n", - "* 4 threads used.\n\n", + "\u003ctable \u003e\u003cthead\u003e\u003ctr\u003e\u003cth\u003eModel name\u003c/th\u003e \u003cth\u003eModel size\u003c/th\u003e \u003cth\u003eDevice \u003c/th\u003e \u003cth\u003eNNAPI\u003c/th\u003e \u003cth\u003eCPU\u003c/th\u003e\u003c/tr\u003e \u003c/thead\u003e \n", + "\u003ctr\u003e \u003ctd rowspan = 3\u003e \u003ca href=\"https://storage.googleapis.com/download.tensorflow.org/models/tflite/arbitrary_style_transfer/style_predict_quantized_256.tflite\"\u003eStyle prediction model\u003c/a\u003e \u003c/td\u003e \n", + "\u003ctd rowspan = 3\u003e2.8 Mb\u003c/td\u003e\n", + "\u003ctd\u003ePixel 3 (Android 10) \u003c/td\u003e \u003ctd\u003e142ms\u003c/td\u003e\u003ctd\u003e14ms*\u003c/td\u003e\u003c/tr\u003e\n", + "\u003ctr\u003e\u003ctd\u003ePixel 4 (Android 10) \u003c/td\u003e \u003ctd\u003e5.2ms\u003c/td\u003e\u003ctd\u003e6.7ms*\u003c/td\u003e\u003c/tr\u003e\n", + "\u003ctr\u003e\u003ctd\u003eiPhone XS (iOS 12.4.1) \u003c/td\u003e \u003ctd\u003e\u003c/td\u003e\u003ctd\u003e10.7ms**\u003c/td\u003e\u003c/tr\u003e\n", + "\u003ctr\u003e \u003ctd rowspan = 3\u003e \u003ca href=\"https://storage.googleapis.com/download.tensorflow.org/models/tflite/arbitrary_style_transfer/style_transfer_quantized_dynamic.tflite\"\u003eStyle transform model\u003c/a\u003e \u003c/td\u003e \n", + "\u003ctd rowspan = 3\u003e0.2s Mb\u003c/td\u003e\n", + "\u003ctd\u003ePixel 3 (Android 10) \u003c/td\u003e \u003ctd\u003e\u003c/td\u003e\u003ctd\u003e540ms*\u003c/td\u003e\u003c/tr\u003e\n", + "\u003ctr\u003e\u003ctd\u003ePixel 4 (Android 10) \u003c/td\u003e \u003ctd\u003e\u003c/td\u003e\u003ctd\u003e405ms*\u003c/td\u003e\u003c/tr\u003e\n", + "\u003ctr\u003e\u003ctd\u003eiPhone XS (iOS 12.4.1) \u003c/td\u003e \u003ctd\u003e\u003c/td\u003e\u003ctd\u003e251ms**\u003c/td\u003e\u003c/tr\u003e\u003c/table\u003e\n", + "* 4 threads used.\n", + "\n", "** 2 threads on iPhone for the best performance.\n", "\n" ] diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD index 82d7fb33e08..2974b26f574 100644 --- a/tensorflow/lite/kernels/BUILD +++ b/tensorflow/lite/kernels/BUILD @@ -614,12 +614,12 @@ cc_library( ":op_macros", "//tensorflow/lite/c:common", "//tensorflow/lite/experimental/ruy/profiler:instrumentation", + "//tensorflow/lite/kernels/internal:common", "//tensorflow/lite/kernels/internal:compatibility", "//tensorflow/lite/kernels/internal:kernel_utils", - "//tensorflow/lite/kernels/internal:optimized_base", + "//tensorflow/lite/kernels/internal:quantization_util", "//tensorflow/lite/kernels/internal:tensor", "//tensorflow/lite/kernels/internal:tensor_utils", - "@gemmlowp", ], ) diff --git a/tensorflow/lite/kernels/depthwise_conv.cc b/tensorflow/lite/kernels/depthwise_conv.cc index ddca22c1df8..34d0556e7bd 100644 --- a/tensorflow/lite/kernels/depthwise_conv.cc +++ b/tensorflow/lite/kernels/depthwise_conv.cc @@ -114,16 +114,19 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumDimensions(filter), 4); const TfLiteType data_type = input->type; + const TfLiteType filter_type = filter->type; const bool is_hybrid = data_type == kTfLiteFloat32 && filter_type == kTfLiteInt8; - TF_LITE_ENSURE(context, data_type == kTfLiteFloat32 || - data_type == kTfLiteUInt8 || - data_type == kTfLiteInt8); + TF_LITE_ENSURE(context, + data_type == kTfLiteFloat32 || data_type == kTfLiteUInt8 || + data_type == kTfLiteInt8 || data_type == kTfLiteInt16); TF_LITE_ENSURE_EQ(context, output->type, data_type); if (!is_hybrid) { - 
TF_LITE_ENSURE_EQ(context, filter->type, data_type);
+    TF_LITE_ENSURE(context,
+                   filter->type == data_type || data_type == kTfLiteInt16);
   }
+
   // Filter in DepthwiseConv is expected to be [1, H, W, O].
   TF_LITE_ENSURE_EQ(context, SizeOfDimension(filter, 0), 1);
@@ -132,6 +135,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   if (data_type == kTfLiteUInt8 || data_type == kTfLiteInt8) {
     TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32);
     TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0);
+  } else if (data_type == kTfLiteInt16) {
+    TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt64);
+    TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0);
+    TF_LITE_ENSURE_EQ(context, input->params.zero_point, 0);
+    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
   } else {
     TF_LITE_ENSURE_EQ(context, bias->type, data_type);
   }
@@ -398,6 +406,34 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
   return kTfLiteOk;
 }
 
+TfLiteStatus EvalQuantizedPerChannel16x8(
+    const TfLiteDepthwiseConvParams* params, const OpData* data,
+    const TfLiteTensor* input, const TfLiteTensor* filter,
+    const TfLiteTensor* bias, TfLiteTensor* output) {
+  DepthwiseParams op_params;
+  op_params.padding_type = PaddingType::kSame;
+  op_params.padding_values.width = data->padding.width;
+  op_params.padding_values.height = data->padding.height;
+  op_params.stride_width = params->stride_width;
+  op_params.stride_height = params->stride_height;
+  op_params.dilation_width_factor = params->dilation_width_factor;
+  op_params.dilation_height_factor = params->dilation_height_factor;
+  op_params.depth_multiplier = params->depth_multiplier;
+  op_params.weights_offset = 0;
+  op_params.quantized_activation_min = data->output_activation_min;
+  op_params.quantized_activation_max = data->output_activation_max;
+
+  reference_integer_ops::DepthwiseConvPerChannel(
+      op_params, data->per_channel_output_multiplier.data(),
+      data->per_channel_output_shift.data(), GetTensorShape(input),
+      GetTensorData<int16>(input), GetTensorShape(filter),
+      GetTensorData<int8>(filter), GetTensorShape(bias),
+      GetTensorData<std::int64_t>(bias), GetTensorShape(output),
+      GetTensorData<int16>(output));
+
+  return kTfLiteOk;
+}
+
 template <KernelType kernel_type>
 TfLiteStatus EvalHybridPerChannel(TfLiteContext* context, TfLiteNode* node,
                                   TfLiteDepthwiseConvParams* params,
@@ -435,6 +471,7 @@ TfLiteStatus EvalHybridPerChannel(TfLiteContext* context, TfLiteNode* node,
   op_params.dilation_width_factor = params->dilation_width_factor;
   op_params.dilation_height_factor = params->dilation_height_factor;
   op_params.depth_multiplier = params->depth_multiplier;
+  op_params.weights_offset = 0;
   op_params.float_activation_min = output_activation_min;
   op_params.float_activation_max = output_activation_max;
@@ -457,6 +494,7 @@ TfLiteStatus EvalHybridPerChannel(TfLiteContext* context, TfLiteNode* node,
         GetTensorData<float>(output), affine_quantization->scale->data,
         input_offset_ptr, CpuBackendContext::GetFromContext(context));
   }
+
   return kTfLiteOk;
 }
@@ -495,6 +533,11 @@ TfLiteStatus EvalImpl(TfLiteContext* context, TfLiteNode* node) {
     case kTfLiteInt8:
       return EvalQuantizedPerChannel(context, node, params, data, input,
                                      filter, bias, output);
+      break;
+    case kTfLiteInt16:
+      return EvalQuantizedPerChannel16x8(params, data, input, filter, bias,
+                                         output);
+      break;
     default:
       context->ReportError(context, "Type %d not currently supported.",
                            input->type);
@@ -513,6 +556,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       return EvalImpl<kernel_type, kTfLiteUInt8>(context, node);
     case kTfLiteInt8:
      return EvalImpl<kernel_type, kTfLiteInt8>(context, node);
+    case kTfLiteInt16:
+      return EvalImpl<kernel_type, kTfLiteInt16>(context, node);
     default:
       context->ReportError(context, "Type %d not currently supported.",
                            input->type);
diff --git a/tensorflow/lite/kernels/depthwise_conv_test.cc b/tensorflow/lite/kernels/depthwise_conv_test.cc
index aeddc71c685..344d156545d 100644
--- a/tensorflow/lite/kernels/depthwise_conv_test.cc
+++ b/tensorflow/lite/kernels/depthwise_conv_test.cc
@@ -71,7 +71,11 @@ class BaseDepthwiseConvolutionOpModel : public SingleOpModel {
           input.scale * filter.per_channel_quantization_scales[i];
       bias_zero_points[i] = 0;
     }
-    TensorData bias{TensorType_INT32,
+    tflite::TensorType bias_type = TensorType_INT32;
+    if (input.type == TensorType_INT16) {
+      bias_type = TensorType_INT64;
+    }
+    TensorData bias{bias_type,
                     {bias_size},
                     /*min=*/0,
                     /*max=*/0,
@@ -1820,7 +1824,7 @@ TEST_P(PerChannelQuantizedDepthwiseConvolutionOpTest, Simple3x3FilterTest) {
            0,
            /*per_channel_quantization=*/true,
            /*per_channel_quantization_scales=*/
-           {1, 2, 3, 4, 4, 3, 2, 1},
+           {0.1, 0.2, 0.3, 0.4, 0.4, 0.3, 0.2, 0.1},
            /*per_channel_quantization_offsets=*/{0, 0, 0, 0, 0, 0, 0, 0},
            /*channel_index=*/3},
       {TensorType_INT8, {}, -63.5, 64, 0.5, -1}, Padding_VALID);
@@ -1840,7 +1844,7 @@ TEST_P(PerChannelQuantizedDepthwiseConvolutionOpTest, Simple3x3FilterTest) {
   // Invoke and verify output.
   m.Invoke();
   EXPECT_THAT(m.GetDequantizedOutput(),
-              ElementsAreArray(ArrayFloatNear({9, 18, 0, 0, 36, 54, 0, 0})));
+              ElementsAreArray(ArrayFloatNear({9, 18, 0, 0, 47, 54, 0, 0})));
 }
 
 TEST_P(PerChannelQuantizedDepthwiseConvolutionOpTest,
@@ -1856,7 +1860,7 @@ TEST_P(PerChannelQuantizedDepthwiseConvolutionOpTest,
            0,
            /*per_channel_quantization=*/true,
            /*per_channel_quantization_scales=*/
-           {1, 2, 3, 4, 4, 3, 2, 1},
+           {0.1, 0.2, 0.3, 0.4, 0.4, 0.3, 0.2, 0.1},
            /*per_channel_quantization_offsets=*/{0, 0, 0, 0, 0, 0, 0, 0},
            /*channel_index=*/3},
       {TensorType_INT8, {}, -63.5, 64, 0.5, -1}, Padding_SAME);
@@ -1875,15 +1879,15 @@ TEST_P(PerChannelQuantizedDepthwiseConvolutionOpTest,
   // Invoke and verify output.
m.Invoke(); - EXPECT_THAT( - m.GetDequantizedOutput(), - ElementsAreArray(ArrayFloatNear({ - // array of 9 x 8 => [1, 3, 3, 8] - 4, 8, 0, 0, 16, 24, 0, 0, 6, 12, 0, 0, 24, 36, 0, 0, 4, 8, 0, 0, - 16, 24, 0, 0, 6, 12, 0, 0, 24, 36, 0, 0, 9, 18, 0, 0, 36, 54, 0, 0, - 6, 12, 0, 0, 24, 36, 0, 0, 4, 8, 0, 0, 16, 24, 0, 0, 6, 12, 0, 0, - 24, 36, 0, 0, 4, 8, 0, 0, 16, 24, 0, 0, - }))); + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear({ + // array of 9 x 8 => [1, 3, 3, 8] + 4, 8, 0, 0, 21, 24, 0, 0, 6, 12, 0, 0, 31.5, 36, 0, 0, + 4, 8, 0, 0, 21, 24, 0, 0, 6, 12, 0, 0, 31.5, 36, 0, 0, + 9, 18, 0, 0, 47, 54, 0, 0, 6, 12, 0, 0, 31.5, 36, 0, 0, + 4, 8, 0, 0, 21, 24, 0, 0, 6, 12, 0, 0, 31.5, 36, 0, 0, + 4, 8, 0, 0, 21, 24, 0, 0, + }))); } INSTANTIATE_TEST_SUITE_P( diff --git a/tensorflow/lite/kernels/exp_test.cc b/tensorflow/lite/kernels/exp_test.cc index 4cb8a5524fa..b6f73169c4b 100644 --- a/tensorflow/lite/kernels/exp_test.cc +++ b/tensorflow/lite/kernels/exp_test.cc @@ -50,14 +50,17 @@ class ExpOpModel : public SingleOpModel { }; TEST(ExpOpTest, FloatTest) { - std::initializer_list data = {1.0, 0.0, -1.0, 1.0, 1.0, -1.0}; - ExpOpModel m({TensorType_FLOAT32, {3, 1, 2}}, TensorType_FLOAT32); + std::initializer_list data = {0.0f, 1.0f, -1.0f, 100.0f, + -100.0f, 0.01f, -0.01f}; + ExpOpModel m({TensorType_FLOAT32, {1, 1, 7}}, TensorType_FLOAT32); m.SetInput(data); m.Invoke(); - EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1, 2})); - EXPECT_THAT(m.GetOutput(), - ElementsAreArray(ArrayFloatNear( - {2.71828, 1, 0.367879, 2.71828, 2.71828, 0.367879}))); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 1, 7})); + EXPECT_THAT( + m.GetOutput(), + ElementsAreArray(ArrayFloatNear( + {std::exp(0.0f), std::exp(1.0f), std::exp(-1.0f), std::exp(100.0f), + std::exp(-100.0f), std::exp(0.01f), std::exp(-0.01f)}))); } } // namespace diff --git a/tensorflow/lite/kernels/fully_connected.cc b/tensorflow/lite/kernels/fully_connected.cc index fe39971f303..0291467694c 100644 --- a/tensorflow/lite/kernels/fully_connected.cc +++ b/tensorflow/lite/kernels/fully_connected.cc @@ -195,6 +195,11 @@ TfLiteStatus PrepareImpl(TfLiteContext* context, TfLiteNode* node) { &data->output_activation_max)); } + if (input->type == kTfLiteInt16 && output->type == kTfLiteInt16) { + TF_LITE_ENSURE_EQ(context, input->params.zero_point, 0); + TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); + } + // If we have to perform on-the-fly quantization (with quantized weights and // float inputs) first we need to quantize the inputs. Allocate a temporary // buffer to store the intermediate quantized values. 
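(Editor's aside between hunks: the comment just above describes TFLite's hybrid path — float activations quantized on the fly so the int8 weights can be used in an integer matmul, with the int32 accumulators scaled back to float afterwards. A self-contained sketch of the symmetric input-quantization step; the helper name is illustrative, not the kernel's actual function.)

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Symmetrically quantize a float batch into an int8 scratch buffer, returning
// the scale needed to dequantize the integer accumulators later.
float SymmetricQuantizeToScratch(const std::vector<float>& input,
                                 std::vector<int8_t>* scratch) {
  float max_abs = 0.f;
  for (float v : input) max_abs = std::max(max_abs, std::fabs(v));
  // Map [-max_abs, max_abs] onto the int8 range; guard against all-zero input.
  const float scale = max_abs > 0.f ? max_abs / 127.f : 1.f;
  scratch->resize(input.size());
  for (size_t i = 0; i < input.size(); ++i) {
    const float q = std::round(input[i] / scale);
    (*scratch)[i] = static_cast<int8_t>(std::min(127.f, std::max(-127.f, q)));
  }
  return scale;
}
```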
@@ -432,9 +437,7 @@ void FullyConnectedInt16(const OpData* data, const TfLiteTensor* input, const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output) { FullyConnectedParams op_params; - op_params.input_offset = -input->params.zero_point; op_params.weights_offset = -filter->params.zero_point; - op_params.output_offset = output->params.zero_point; op_params.output_multiplier = data->output_multiplier; op_params.output_shift = data->output_shift; op_params.quantized_activation_min = data->output_activation_min; diff --git a/tensorflow/lite/kernels/fully_connected_test.cc b/tensorflow/lite/kernels/fully_connected_test.cc index 21273cd7a3a..6c8337f17ab 100644 --- a/tensorflow/lite/kernels/fully_connected_test.cc +++ b/tensorflow/lite/kernels/fully_connected_test.cc @@ -540,11 +540,11 @@ TEST_P(QuantizedFullyConnectedOpTest, SimpleTestQuantizedInt8) { } TEST_P(QuantizedFullyConnectedOpTest, SimpleTestQuantizedInt16) { - const float ulp = (float)1 / (float)512; + const float scale = 128.0 / 65536; QuantizedFullyConnectedOpModel m( GetRegistration(), /*units=*/3, /*batches*/ 2, - /*input=*/{TensorType_INT16, {2, 10}, -64 + ulp, 64}, - /*output=*/{TensorType_INT16, {}, -128 + 2 * ulp, 128}); + /*input=*/{TensorType_INT16, {2, 10}, 0, 0, scale, 0}, + /*output=*/{TensorType_INT16, {}, 0, 0, scale, 0}); // input_product_scale < output_scale was not true. m.SetWeights({ @@ -564,8 +564,7 @@ TEST_P(QuantizedFullyConnectedOpTest, SimpleTestQuantizedInt16) { EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear({24, 25, 26, 58, 59, 60}))); EXPECT_THAT(m.GetOutput(), - ElementsAre(24 * 256 - 1, 25 * 256 - 1, 26 * 256 - 1, - 58 * 256 - 1, 59 * 256 - 1, 60 * 256 - 1)); + ElementsAre(12288, 12800, 13312, 29696, 30208, 30720)); } TEST_P(QuantizedFullyConnectedOpTest, SimpleTestQuantizedInt8NoBias) { diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD index 03bdfd1cf36..c7a76d79e48 100644 --- a/tensorflow/lite/kernels/internal/BUILD +++ b/tensorflow/lite/kernels/internal/BUILD @@ -843,6 +843,23 @@ cc_test( ], ) +cc_test( + name = "depthwiseconv_per_channel_quantized_16x8_test", + srcs = [ + "depthwiseconv_per_channel_quantized_16x8_test.cc", + ], + shard_count = 2, + deps = [ + ":common", + ":optimized_base", + ":quantization_util", + ":reference_base", + ":test_util", + ":types", + "@com_google_googletest//:gtest_main", + ], +) + cc_test( name = "resize_bilinear_test", srcs = ["resize_bilinear_test.cc"], diff --git a/tensorflow/lite/kernels/internal/depthwiseconv_per_channel_quantized_16x8_test.cc b/tensorflow/lite/kernels/internal/depthwiseconv_per_channel_quantized_16x8_test.cc new file mode 100644 index 00000000000..7d586c5ac94 --- /dev/null +++ b/tensorflow/lite/kernels/internal/depthwiseconv_per_channel_quantized_16x8_test.cc @@ -0,0 +1,320 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+#include <stddef.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <iostream>
+#include <limits>
+#include <random>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h"
+#include "tensorflow/lite/kernels/internal/test_util.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace {
+
+void PickOutputMultiplier(
+    const DepthwiseParams& params, const RuntimeShape& input_shape,
+    const int16* input_data, const RuntimeShape& filter_shape,
+    const int8* filter_data, const RuntimeShape& bias_shape,
+    const std::int64_t* bias_data, const RuntimeShape& output_shape,
+    float* output_multiplier) {
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int depth_multiplier = params.depth_multiplier;
+
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int input_depth = input_shape.Dims(3);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+
+  std::int64_t output_accu_min = std::numeric_limits<std::int64_t>::max();
+  std::int64_t output_accu_max = std::numeric_limits<std::int64_t>::min();
+
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+          for (int m = 0; m < depth_multiplier; ++m) {
+            const int output_channel = m + in_channel * depth_multiplier;
+            const int in_x_origin = (out_x * stride_width) - pad_width;
+            const int in_y_origin = (out_y * stride_height) - pad_height;
+            std::int64_t acc = 0;
+            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+                const int in_y =
+                    in_y_origin + dilation_height_factor * filter_y;
+                // Zero padding by omitting the areas outside the image.
+                const bool is_point_inside_image =
+                    (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height);
+                if (is_point_inside_image) {
+                  int32 input_val = input_data[Offset(input_shape, batch, in_y,
+                                                      in_x, in_channel)];
+                  int32 filter_val = filter_data[Offset(
+                      filter_shape, 0, filter_y, filter_x, output_channel)];
+                  acc += static_cast<std::int64_t>(filter_val) *
+                         static_cast<std::int64_t>(input_val);
+                }
+              }
+            }
+            if (bias_data) {
+              acc += bias_data[output_channel];
+            }
+            output_accu_max = std::max(acc, output_accu_max);
+            output_accu_min = std::min(acc, output_accu_min);
+          }
+        }
+      }
+    }
+  }
+
+  // Since int16 ranges from -32768 to 32767, we pick an output multiplier that
+  // maps the observed accumulator min/max onto that range as closely as
+  // possible.
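+  // (Editor's illustration, not in the original patch: if the probed
+  // accumulator range were [-100000, 80000], |min| dominates, so
+  // output_multiplier = 32768.0f / 100000 = 0.32768. QuantizeMultiplier
+  // later factors that as 0.65536 * 2^-1, i.e. a Q31 fixed-point multiplier
+  // of ~1407374884 with shift -1.)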
+  if (std::abs(output_accu_max) > std::abs(output_accu_min)) {
+    *output_multiplier = 32767.0f / std::abs(output_accu_max);
+  } else {
+    *output_multiplier = 32768.0f / std::abs(output_accu_min);
+  }
+}
+
+void PickReasonableMultiplier(
+    const DepthwiseParams& params, int output_activation_min,
+    int output_activation_max, int output_depth,
+    const RuntimeShape& input_shape_inference, const std::int16_t* input_data,
+    const RuntimeShape& filter_shape_inference, const std::int8_t* filter_data,
+    const RuntimeShape& bias_shape_inference, const std::int64_t* bias_data,
+    const RuntimeShape& output_shape_inference,
+    std::int32_t* output_multiplier_ptr, std::int32_t* output_shift_ptr,
+    std::int16_t* output_data) {
+  float output_multiplier;
+  PickOutputMultiplier(params, input_shape_inference, input_data,
+                       filter_shape_inference, filter_data,
+                       bias_shape_inference, bias_data, output_shape_inference,
+                       &output_multiplier);
+
+  int base_multiplier;
+  int base_shift;
+  QuantizeMultiplier(output_multiplier, &base_multiplier, &base_shift);
+  for (int i = 0; i < output_depth; ++i) {
+    // multipliers typically range in [2^30 ; 2^31 - 1].
+    // Values in [0, 2^30 - 1] are normally unused, but harmless.
+    // Thus a good way to randomize multipliers is to subtract from them
+    // a random value smaller than 2^30 but still significant compared to it.
+    output_multiplier_ptr[i] = base_multiplier - (std::rand() % (1 << 26));
+    output_shift_ptr[i] = base_shift - 1 + (std::rand() % 4);
+  }
+}
+
+bool GenerateValidShapeConfigurations(
+    int filter_width, int filter_height, int depth_multiplier,
+    int dilation_width_factor, int dilation_height_factor,
+    RuntimeShape* input_shape_inference, RuntimeShape* filter_shape_inference,
+    RuntimeShape* output_shape_inference, int* pad_width, int* pad_height,
+    int* stride) {
+  const int batch = UniformRandomInt(1, 3);
+  const int input_depth = 8 * ExponentialRandomPositiveInt(0.9f, 10, 50);
+  const int input_width = UniformRandomInt(5, 50);
+  const int input_height = UniformRandomInt(5, 50);
+  *stride = UniformRandomInt(1, 2);
+  const bool test_pad = UniformRandomInt(0, 1);
+  const auto padding_type = test_pad ? PaddingType::kValid : PaddingType::kSame;
+
+  const int output_depth = input_depth * depth_multiplier;
+
+  input_shape_inference->BuildFrom(
+      {batch, input_height, input_width, input_depth});
+
+  filter_shape_inference->BuildFrom(
+      {1, filter_height, filter_width, output_depth});
+
+  EXPECT_TRUE(ComputeConvSizes(
+      *input_shape_inference, output_depth, filter_width, filter_height,
+      *stride, dilation_width_factor, dilation_height_factor, padding_type,
+      output_shape_inference, pad_width, pad_height));
+
+  return true;
+}
+
+void IntToFloat(std::vector<float>* d, std::vector<std::int8_t>* s) {
+  for (unsigned int i = 0; i < s->size(); i++) {
+    d->data()[i] = (float)s->data()[i];
+  }
+}
+
+void IntToFloat(std::vector<float>* d, std::vector<std::int64_t>* s) {
+  for (unsigned int i = 0; i < s->size(); i++) {
+    d->data()[i] = (float)s->data()[i];
+  }
+}
+
+void TryTestOneDepthwiseConv3x3Filter() {
+  const int filter_width = 3;
+  const int filter_height = 3;
+  const int depth_multiplier = 1;
+  // We don't support dilations in the 3x3 filter.
+  const int dilation_width_factor = 1;
+  const int dilation_height_factor = 1;
+
+  const int output_activation_min = -32768;
+  const int output_activation_max = 32767;
+
+  RuntimeShape input_shape_inference;
+  RuntimeShape filter_shape_inference;
+  RuntimeShape output_shape_inference;
+  int pad_width, pad_height;
+  int stride;
+
+  // Keeps trying until we get valid shape/configurations for 3x3 filter case.
+  bool generated_valid_configurations_for_3x3_kernel = false;
+  while (!generated_valid_configurations_for_3x3_kernel) {
+    generated_valid_configurations_for_3x3_kernel =
+        GenerateValidShapeConfigurations(
+            filter_width, filter_height, depth_multiplier,
+            dilation_width_factor, dilation_height_factor,
+            &input_shape_inference, &filter_shape_inference,
+            &output_shape_inference, &pad_width, &pad_height, &stride);
+  }
+
+  const int output_depth = output_shape_inference.Dims(3);
+
+  RuntimeShape bias_shape_inference({1, 1, 1, output_depth});
+  const int input_buffer_size = input_shape_inference.FlatSize();
+  const int filter_buffer_size = filter_shape_inference.FlatSize();
+  const int output_buffer_size = output_shape_inference.FlatSize();
+  std::vector<std::int16_t> input_data(input_buffer_size);
+  std::vector<std::int8_t> filter_data(filter_buffer_size);
+  std::vector<std::int64_t> bias_data(output_depth);
+
+  FillRandom(&input_data);
+  FillRandom(&filter_data);
+  for (int i = 0; i < output_depth; i++) {
+    bias_data.data()[i] = 0;
+  }
+
+  DepthwiseParams params;
+  params.stride_width = stride;
+  params.stride_height = stride;
+  params.dilation_height_factor = dilation_height_factor;
+  params.dilation_width_factor = dilation_width_factor;
+  params.padding_values.width = pad_width;
+  params.padding_values.height = pad_height;
+  params.depth_multiplier = depth_multiplier;
+  params.weights_offset = 0;
+  params.quantized_activation_min = output_activation_min;
+  params.quantized_activation_max = output_activation_max;
+  params.float_activation_max = (float)(1LL << 40);
+  params.float_activation_min = -params.float_activation_max;
+
+  std::vector<std::int16_t> reference_output_data(output_buffer_size);
+  std::vector<std::int16_t> neon_output_data(output_buffer_size);
+
+  std::vector<std::int32_t> output_multiplier(output_depth);
+  std::vector<std::int32_t> output_shift(output_depth);
+
+  // It's hard to come up with the right multiplier: a random guess mostly
+  // saturates the results and makes them meaningless. So we first use the
+  // reference impl to probe the min/max value of the accumulation, then use
+  // that value as a guided suggestion to populate a meaningful multiplier &
+  // shift.
+  PickReasonableMultiplier(
+      params, output_activation_min, output_activation_max, output_depth,
+      input_shape_inference, input_data.data(), filter_shape_inference,
+      filter_data.data(), bias_shape_inference, bias_data.data(),
+      output_shape_inference, output_multiplier.data(), output_shift.data(),
+      reference_output_data.data());
+
+  // The following tests compare reference impl for 16x8 version and
+  // float reference operator.
+  reference_integer_ops::DepthwiseConvPerChannel(
+      params, output_multiplier.data(), output_shift.data(),
+      input_shape_inference, input_data.data(), filter_shape_inference,
+      filter_data.data(), bias_shape_inference, bias_data.data(),
+      output_shape_inference, reference_output_data.data());
+
+  std::vector<float> input_data_float(input_buffer_size);
+  std::vector<float> filter_data_float(filter_buffer_size);
+  std::vector<float> bias_data_float(output_depth);
+  std::vector<float> output_data_float(output_buffer_size);
+
+  for (int i = 0; i < input_buffer_size; i++) {
+    input_data_float.data()[i] = (float)(input_data.data()[i]);
+  }
+  IntToFloat(&filter_data_float, &filter_data);
+  IntToFloat(&bias_data_float, &bias_data);
+
+  reference_ops::DepthwiseConv(
+      params, input_shape_inference, input_data_float.data(),
+      filter_shape_inference, filter_data_float.data(), bias_shape_inference,
+      bias_data_float.data(), output_shape_inference, output_data_float.data());
+
+  for (int n = 0; n < output_shape_inference.Dims(0); n++) {
+    for (int h = 0; h < output_shape_inference.Dims(1); h++) {
+      for (int w = 0; w < output_shape_inference.Dims(2); w++) {
+        for (int c = 0; c < output_shape_inference.Dims(3); c++) {
+          int offset = Offset(output_shape_inference, n, h, w, c);
+          float float_res = output_data_float.data()[offset];
+          int16 int16_res = reference_output_data.data()[offset];
+          int32 output_mul = output_multiplier.data()[c];
+          int shift = output_shift.data()[c];
+          float scale = (float)output_mul / (float)(1ULL << 31);
+          if (shift > 0) scale = scale * (float)(1 << shift);
+          if (shift < 0) scale = scale / (float)(1 << -shift);
+          int ref_res = floor(float_res * scale + 0.5);
+          if (ref_res < output_activation_min) ref_res = output_activation_min;
+          if (ref_res > output_activation_max) ref_res = output_activation_max;
+          int e = (ref_res - int16_res);
+          if (e < 0) e = -e;
+          if (e > 1) {
+            printf(
+                "(%d,%d,%d,%d) scale=%08x shift=%d res=%d float=%f (%f,%f)\n",
+                n, h, w, c, output_mul, shift, int16_res, float_res * scale,
+                float_res, scale);
+            EXPECT_TRUE(false);
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST(QuantizedDepthwiseConvPerChannelTest, FastKernelTest) {
+  for (int i = 0; i < 30; ++i) {
+    TryTestOneDepthwiseConv3x3Filter();
+  }
+}
+
+}  // namespace
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/internal/depthwiseconv_per_channel_quantized_test.cc b/tensorflow/lite/kernels/internal/depthwiseconv_per_channel_quantized_test.cc
index 3c809560c35..3fb824ca902 100644
--- a/tensorflow/lite/kernels/internal/depthwiseconv_per_channel_quantized_test.cc
+++ b/tensorflow/lite/kernels/internal/depthwiseconv_per_channel_quantized_test.cc
@@ -163,9 +163,8 @@ void CompareRoundingResults(int flat_size, const int depth_multiplier,
 
   // The tolerance that we apply to means is tight, but we allow for a rounding
   // difference in one pixel, and loosen by another 1% for float comparison.
- float mean_tolerance = - std::max(1e-5f, 1.01f / flat_size * std::sqrt(1.f * depth_multiplier)); - mean_tolerance = 500.f; + const float mean_tolerance = + std::max(1e-2f, 1.01f / flat_size * std::sqrt(1.f * depth_multiplier)); const int diff_mean_tolerance = 256; const int diff_median_tolerance = 225; @@ -347,7 +346,7 @@ void TryTestOneDepthwiseConv3x3Filter() { } TEST(QuantizedDepthwiseConvPerChannelTest, FastKernelTest) { - for (int i = 0; i < 30; ++i) { + for (int i = 0; i < 60; ++i) { TryTestOneDepthwiseConv3x3Filter(); } } diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h index f7860e29e69..171475df107 100644 --- a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h +++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h @@ -261,12 +261,12 @@ inline DotProduct3x3KernelType CategorizeDotProductKernel( const int32 filter_height = filter_shape.Dims(1); const int32 filter_width = filter_shape.Dims(2); - bool supported = - stride == params.stride_height && stride <= 2 && padding <= 1 && - filter_width == 3 && filter_height == 3 && params.output_shift <= 0 && - params.dilation_width_factor == 1 && params.dilation_height_factor == 1 && - (((input_depth % 8) == 0 && depth_multiplier == 1) || - (input_depth == 1 && depth_multiplier > 1)); + bool supported = stride == params.stride_height && stride <= 2 && + padding <= 1 && filter_width == 3 && filter_height == 3 && + params.dilation_width_factor == 1 && + params.dilation_height_factor == 1 && + (((input_depth % 8) == 0 && depth_multiplier == 1) || + (input_depth == 1 && depth_multiplier > 1)); if (!supported) { return DotProduct3x3KernelType::kNone; @@ -277,15 +277,11 @@ inline DotProduct3x3KernelType CategorizeDotProductKernel( } if (quantization_type == QuantizationType::kPerChannelInt8) { - const int32 output_depth = output_shape.Dims(3); if (output_shift_ptr == nullptr) { return DotProduct3x3KernelType::kNone; } - for (int i = 0; i < output_depth; ++i) { - if (output_shift_ptr[i] > 0) { - return DotProduct3x3KernelType::kNone; - } - } + } else if (params.output_shift > 0) { + return DotProduct3x3KernelType::kNone; } if (params.depth_multiplier == 1) { diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h index 4745003b5ea..642d7577a1b 100644 --- a/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h @@ -1814,8 +1814,9 @@ inline void DepthwiseConvWithRounding( const auto ruy_paths = ruy_context != nullptr ? ruy_context->GetRuntimeEnabledPaths() : ruy::Path::kNone; + // TODO(b/150208140): Re-enable once erroneous activation in test is resolved. const bool has_dot_product_instructions = - (ruy_paths & ruy::Path::kNeonDotprod) != ruy::Path::kNone; + false && (ruy_paths & ruy::Path::kNeonDotprod) != ruy::Path::kNone; // Dispatch to dot-product 3x3 kernels when supported. 
if (has_dot_product_instructions) { @@ -1823,13 +1824,16 @@ inline void DepthwiseConvWithRounding( DotProduct3x3KernelType kernel_type = optimized_ops::depthwise_conv::CategorizeDotProductKernel< optimized_ops::depthwise_conv::QuantizationType::kPerChannelInt8>( - input_shape, filter_shape, output_shape, params); + input_shape, filter_shape, output_shape, params, output_shift); if (kernel_type != DotProduct3x3KernelType::kNone) { ruy::profiler::ScopeLabel specialized_label( "DepthwiseConvInt8/8bit/3x3XDotProduct"); + DepthwiseParams params_copy = params; + params_copy.output_shift_per_channel = output_shift; + params_copy.output_multiplier_per_channel = output_multiplier; optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3PerChannel< DepthwiseConvImplementation::kUseNeon3x3DotProduct>( - params, input_shape, input_data, filter_shape, filter_data, + params_copy, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, output_shape, output_data, thread_start, thread_end, thread_dim); return; diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h index f82926825ed..5db126c1a11 100644 --- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h +++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h @@ -57,9 +57,9 @@ void MatrixBatchVectorMultiplyAccumulate( const int8_t* __restrict__ vectors, const float* scaling_factors, int n_batch, float* __restrict__ result, int result_stride, const float* per_channel_scale, const int32_t* input_offset) { - return NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, - m_cols, vectors, scaling_factors, n_batch, result, - result_stride, per_channel_scale, input_offset); + NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, + vectors, scaling_factors, n_batch, result, result_stride, + per_channel_scale, input_offset); } void MatrixBatchVectorMultiplyAccumulate( @@ -69,10 +69,10 @@ void MatrixBatchVectorMultiplyAccumulate( const float* per_channel_scale, const int32_t* input_offset, int32_t* scratch, int32_t* row_sums, bool* compute_row_sums, CpuBackendContext* context) { - return NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, - m_cols, vectors, scaling_factors, n_batch, result, - result_stride, per_channel_scale, input_offset, - scratch, row_sums, compute_row_sums, context); + NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, + vectors, scaling_factors, n_batch, result, result_stride, + per_channel_scale, input_offset, scratch, row_sums, + compute_row_sums, context); } void SparseMatrixBatchVectorMultiplyAccumulate( @@ -111,6 +111,31 @@ void MatrixBatchVectorMultiplyAccumulate( n_output, output_zp, scratch, output, context); } +void MatrixBatchVectorMultiply(const int8_t* input, int32_t input_zeropoint, + const int8_t* input_to_gate_weights, + int32_t input_to_gate_effective_scale_a, + int32_t input_to_gate_effective_scale_b, + int32_t n_batch, int32_t n_input, int32_t n_cell, + int8_t* gate_output, int8_t gate_output_zp) { + PortableMatrixBatchVectorMultiply( + input, input_zeropoint, input_to_gate_weights, + input_to_gate_effective_scale_a, input_to_gate_effective_scale_b, n_batch, + n_input, n_cell, gate_output, gate_output_zp); +} + +void MatrixBatchVectorMultiply(const int16_t* hidden, + const int8_t* hidden_to_output_weights, + int32_t proj_effective_scale_a, + int32_t proj_effective_scale_b, + const int32_t* gate_bias, int32_t 
n_batch, + int32_t n_hidden, int32_t n_output, + int32_t output_zp, int8_t* proj_output) { + PortableMatrixBatchVectorMultiply(hidden, hidden_to_output_weights, + proj_effective_scale_a, + proj_effective_scale_b, gate_bias, n_batch, + n_hidden, n_output, output_zp, proj_output); +} + void MatrixScalarMultiplyAccumulate(const int8_t* matrix, int32_t scalar, int32_t n_row, int32_t n_col, int32_t* output) { @@ -127,16 +152,36 @@ void ApplyLayerNorm(const int16_t* input, const int16_t* layer_norm_weights, n_batch, n_input, output); } +void ApplyLayerNormFloat(const int16_t* input, + const int16_t* layer_norm_weights, + int32_t layer_norm_scale_a, int32_t layer_norm_scale_b, + const int32_t* bias, int n_batch, int n_input, + int16_t* output) { + PortableApplyLayerNormFloat(input, layer_norm_weights, layer_norm_scale_a, + layer_norm_scale_b, bias, n_batch, n_input, + output); +} + void ApplySigmoid(const int16_t* input, int32_t n_batch, int32_t n_input, int16_t* output) { NEON_OR_PORTABLE(ApplySigmoid, input, n_batch, n_input, output); } +void ApplySigmoidFloat(const int16_t* input, int32_t n_batch, int32_t n_input, + int16_t* output) { + PortableApplySigmoidFloat(input, n_batch, n_input, output); +} + void ApplyTanh(int32_t integer_bits, const int16_t* input, int32_t n_batch, int32_t n_input, int16_t* output) { NEON_OR_PORTABLE(ApplyTanh, integer_bits, input, n_batch, n_input, output); } +void ApplyTanhFloat(const int16_t* input, int32_t n_batch, int32_t n_input, + int32_t integer_bits, int16_t* output) { + PortableApplyTanhFloat(input, n_batch, n_input, integer_bits, output); +} + void CwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch, int n_input, int shift, int16_t* output) { NEON_OR_PORTABLE(CwiseMul, input_1, input_2, n_batch, n_input, shift, output); @@ -167,8 +212,8 @@ void CwiseClipping(int8_t* input, const int8_t clipping_value, int32_t n_batch, void BatchVectorBatchVectorDotProduct(const int16_t* vector1, const int16_t* vector2, int v_size, int n_batch, int32_t* result) { - return PortableBatchVectorBatchVectorDotProduct(vector1, vector2, v_size, - n_batch, result); + PortableBatchVectorBatchVectorDotProduct(vector1, vector2, v_size, n_batch, + result); } void VectorBatchVectorCwiseProductAccumulate(const int16_t* vector, int v_size, @@ -260,6 +305,19 @@ void MeanStddevNormalization(const float* input_vector, float* output_vector, PortableMeanStddevNormalization(input_vector, output_vector, v_size, n_batch); } +void TwoGateSaturationgAdd(const int8_t* input, int8_t input_zp, + const int8_t* recurrent, int8_t recurrent_zp, + int32_t input_effective_scale_a, + int32_t input_effective_scale_b, + int32_t recurrent_effective_scale_a, + int32_t recurrent_effective_scale_b, int32_t n_batch, + int32_t n_cell, int16_t* output) { + PortableTwoGateSaturationgAdd( + input, input_zp, recurrent, recurrent_zp, input_effective_scale_a, + input_effective_scale_b, recurrent_effective_scale_a, + recurrent_effective_scale_b, n_batch, n_cell, output); +} + } // namespace tensor_utils } // namespace tflite diff --git a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h index a0cbcd2d9bf..fce9aeb1691 100644 --- a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h +++ b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h @@ -126,6 +126,31 @@ void MatrixBatchVectorMultiplyAccumulate( shift, n_batch, n_input, n_output, output_zp, scratch, output, context); } +void MatrixBatchVectorMultiply(const 
int8_t* input, int32_t input_zeropoint, + const int8_t* input_to_gate_weights, + int32_t input_to_gate_effective_scale_a, + int32_t input_to_gate_effective_scale_b, + int32_t n_batch, int32_t n_input, int32_t n_cell, + int8_t* gate_output, int8_t gate_output_zp) { + PortableMatrixBatchVectorMultiply( + input, input_zeropoint, input_to_gate_weights, + input_to_gate_effective_scale_a, input_to_gate_effective_scale_b, n_batch, + n_input, n_cell, gate_output, gate_output_zp); +} + +void MatrixBatchVectorMultiply(const int16_t* hidden, + const int8_t* hidden_to_output_weights, + int32_t proj_effective_scale_a, + int32_t proj_effective_scale_b, + const int32_t* gate_bias, int32_t n_batch, + int32_t n_hidden, int32_t n_output, + int32_t output_zp, int8_t* proj_output) { + PortableMatrixBatchVectorMultiply(hidden, hidden_to_output_weights, + proj_effective_scale_a, + proj_effective_scale_b, gate_bias, n_batch, + n_hidden, n_output, output_zp, proj_output); +} + void MatrixScalarMultiplyAccumulate(const int8_t* matrix, int32_t scalar, int32_t n_row, int32_t n_col, int32_t* output) { @@ -141,16 +166,36 @@ void ApplyLayerNorm(const int16_t* input, const int16_t* layer_norm_weights, output); } +void ApplyLayerNormFloat(const int16_t* input, + const int16_t* layer_norm_weights, + int32_t layer_norm_scale_a, int32_t layer_norm_scale_b, + const int32_t* bias, int n_batch, int n_input, + int16_t* output) { + PortableApplyLayerNormFloat(input, layer_norm_weights, layer_norm_scale_a, + layer_norm_scale_b, bias, n_batch, n_input, + output); +} + void ApplySigmoid(const int16_t* input, int32_t n_batch, int32_t n_input, int16_t* output) { PortableApplySigmoid(input, n_batch, n_input, output); } +void ApplySigmoidFloat(const int16_t* input, int32_t n_batch, int32_t n_input, + int16_t* output) { + PortableApplySigmoidFloat(input, n_batch, n_input, output); +} + void ApplyTanh(int32_t intger_bits, const int16_t* input, int32_t n_batch, int32_t n_input, int16_t* output) { PortableApplyTanh(intger_bits, input, n_batch, n_input, output); } +void ApplyTanhFloat(const int16_t* input, int32_t n_batch, int32_t n_input, + int32_t integer_bits, int16_t* output) { + PortableApplyTanhFloat(input, n_batch, n_input, integer_bits, output); +} + void CwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch, int n_input, int shift, int16_t* output) { PortableCwiseMul(input_1, input_2, n_batch, n_input, shift, output); @@ -181,8 +226,8 @@ void CwiseClipping(int8_t* input, const int8_t clipping_value, int32_t n_batch, void BatchVectorBatchVectorDotProduct(const int16_t* vector1, const int16_t* vector2, int v_size, int n_batch, int32_t* result) { - return PortableBatchVectorBatchVectorDotProduct(vector1, vector2, v_size, - n_batch, result); + PortableBatchVectorBatchVectorDotProduct(vector1, vector2, v_size, n_batch, + result); } void VectorBatchVectorCwiseProductAccumulate(const int16_t* vector, int v_size, @@ -274,6 +319,19 @@ void MeanStddevNormalization(const float* input_vector, float* output_vector, PortableMeanStddevNormalization(input_vector, output_vector, v_size, n_batch); } +void TwoGateSaturationgAdd(const int8_t* input, int8_t input_zp, + const int8_t* recurrent, int8_t recurrent_zp, + int32_t input_effective_scale_a, + int32_t input_effective_scale_b, + int32_t recurrent_effective_scale_a, + int32_t recurrent_effective_scale_b, int32_t n_batch, + int32_t n_cell, int16_t* output) { + PortableTwoGateSaturationgAdd( + input, input_zp, recurrent, recurrent_zp, input_effective_scale_a, + 
input_effective_scale_b, recurrent_effective_scale_a, + recurrent_effective_scale_b, n_batch, n_cell, output); +} + } // namespace tensor_utils } // namespace tflite diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h b/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h index 2fab9a13719..75e60c9cdf0 100644 --- a/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h +++ b/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h @@ -119,6 +119,92 @@ inline void DepthwiseConvPerChannel( } } +inline void DepthwiseConvPerChannel( + const DepthwiseParams& params, const int32* output_multiplier, + const int32* output_shift, const RuntimeShape& input_shape, + const int16* input_data, const RuntimeShape& filter_shape, + const int8* filter_data, const RuntimeShape& bias_shape, + const std::int64_t* bias_data, const RuntimeShape& output_shape, + int16* output_data) { + // Get parameters. + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int depth_multiplier = params.depth_multiplier; + const int32 output_activation_min = params.quantized_activation_min; + const int32 output_activation_max = params.quantized_activation_max; + + // Check dimensions of the tensors. + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + + TFLITE_DCHECK_LE(output_activation_min, output_activation_max); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier); + TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); + + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < output_width; ++out_x) { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) { + for (int m = 0; m < depth_multiplier; ++m) { + const int output_channel = m + in_channel * depth_multiplier; + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + std::int64_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = + in_y_origin + dilation_height_factor * filter_y; + // Zero padding by omitting the areas outside the image. 
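+                // A point lies inside the image iff 0 <= in_x < input_width
+                // and 0 <= in_y < input_height; out-of-range taps contribute
+                // nothing, which is what implements the zero padding.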
+                const bool is_point_inside_image =
+                    (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height);
+                if (is_point_inside_image) {
+                  int32 input_val = input_data[Offset(input_shape, batch, in_y,
+                                                      in_x, in_channel)];
+                  int32 filter_val = filter_data[Offset(
+                      filter_shape, 0, filter_y, filter_x, output_channel)];
+                  // Accumulate with a 64-bit accumulator. We assume a maximum
+                  // of 2^16 accumulations, as in the 8-bit case, so the value
+                  // in the accumulator should not exceed 40 bits.
+                  acc += static_cast<int64_t>(filter_val) *
+                         static_cast<int64_t>(input_val);
+                }
+              }
+            }
+            if (bias_data) {
+              acc += bias_data[output_channel];
+            }
+            int32 scaled_acc = MultiplyByQuantizedMultiplier(
+                acc, output_multiplier[output_channel],
+                output_shift[output_channel]);
+            scaled_acc = std::max(scaled_acc, output_activation_min);
+            scaled_acc = std::min(scaled_acc, output_activation_max);
+            output_data[Offset(output_shape, batch, out_y, out_x,
+                               output_channel)] =
+                static_cast<int16>(scaled_acc);
+          }
+        }
+      }
+    }
+  }
+}
+
 inline void DepthwiseConvHybridPerChannel(
     const DepthwiseParams& params, float* scaling_factors_ptr,
     const RuntimeShape& input_shape, const int8* input_data,
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h b/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h
index 28f9b2f0994..fd9cb0180e1 100644
--- a/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h
@@ -68,9 +68,7 @@ inline void FullyConnected(
     const int8_t* filter_data, const RuntimeShape& bias_shape,
     const int64_t* bias_data, const RuntimeShape& output_shape,
     int16_t* output_data) {
-  const int32 input_offset = params.input_offset;
   const int32 filter_offset = params.weights_offset;
-  const int32 output_offset = params.output_offset;
   const int32 output_multiplier = params.output_multiplier;
   const int output_shift = params.output_shift;
   const int32 output_activation_min = params.quantized_activation_min;
@@ -90,14 +88,13 @@ inline void FullyConnected(
     for (int d = 0; d < accum_depth; ++d) {
       int32 input_val = input_data[b * accum_depth + d];
       int32 filter_val = filter_data[out_c * accum_depth + d];
-      acc += (filter_val + filter_offset) * (input_val + input_offset);
+      acc += (filter_val + filter_offset) * input_val;
     }
     if (bias_data) {
       acc += bias_data[out_c];
     }
     int32_t acc_scaled =
         MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
-    acc_scaled += output_offset;
     acc_scaled = std::max(acc_scaled, output_activation_min);
     acc_scaled = std::min(acc_scaled, output_activation_max);
     output_data[out_c + output_depth * b] = static_cast<int16_t>(acc_scaled);
diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc
index 5d7907b20ef..4fceb905426 100644
--- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc
+++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc
@@ -340,6 +340,74 @@ void PortableMatrixBatchVectorMultiplyAccumulate(
       n_output, output_zp, output);
 }
 
+void PortableMatrixBatchVectorMultiply(const int8_t* input,
+                                       int32_t input_zeropoint,
+                                       const int8_t* input_to_gate_weights,
+                                       int32_t input_to_gate_effective_scale_a,
+                                       int32_t input_to_gate_effective_scale_b,
+                                       int32_t n_batch, int32_t n_input,
+                                       int32_t n_cell, int8_t* gate_output,
+                                       int8_t gate_output_zp) {
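+  // For each (batch, row) pair: compute the zero-point-corrected int8 dot
+  // product, rescale it with MultiplyByQuantizedMultiplier, add the output
+  // zero point, and saturate the result to the int8 range.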
+  const int32_t int8_max = std::numeric_limits<int8_t>::max();
+  const int32_t int8_min = std::numeric_limits<int8_t>::min();
+  for (int batch = 0; batch < n_batch; ++batch) {
+    for (int row = 0; row < n_cell; ++row) {
+      int32_t acc = 0;
+      for (int col = 0; col < n_input; ++col) {
+        int32_t input_val = input[batch * n_input + col];
+        int8_t weights_val = input_to_gate_weights[row * n_input + col];
+        acc += (input_val - input_zeropoint) * weights_val;
+      }
+      acc = MultiplyByQuantizedMultiplier(acc, input_to_gate_effective_scale_a,
+                                          input_to_gate_effective_scale_b);
+      acc += gate_output_zp;
+      if (acc > int8_max) {
+        acc = int8_max;
+      }
+      if (acc < int8_min) {
+        acc = int8_min;
+      }
+      gate_output[batch * n_cell + row] = static_cast<int8_t>(acc);
+    }
+  }
+}
+
+void PortableMatrixBatchVectorMultiply(
+    const int16_t* hidden, const int8_t* hidden_to_output_weights,
+    int32_t proj_effective_scale_a, int32_t proj_effective_scale_b,
+    const int32_t* gate_bias, int32_t n_batch, int32_t n_hidden,
+    int32_t n_output, int32_t output_zp, int8_t* proj_output) {
+  const int16_t int8_max = std::numeric_limits<int8_t>::max();
+  const int16_t int8_min = std::numeric_limits<int8_t>::min();
+  for (int batch = 0; batch < n_batch; ++batch) {
+    for (int row = 0; row < n_output; ++row) {
+      int64_t acc = gate_bias[row];
+      for (int col = 0; col < n_hidden; ++col) {
+        int16_t input_val = hidden[batch * n_hidden + col];
+        int8_t weights_val = hidden_to_output_weights[row * n_hidden + col];
+        int64_t curr = acc;
+        acc += input_val * weights_val;
+        if (input_val * weights_val > 0 && acc < curr) {
+          acc = std::numeric_limits<int32_t>::max();
+        }
+        if (input_val * weights_val < 0 && acc > curr) {
+          acc = std::numeric_limits<int32_t>::min();
+        }
+      }
+      acc = MultiplyByQuantizedMultiplier(acc, proj_effective_scale_a,
+                                          proj_effective_scale_b);
+      acc += output_zp;
+      if (acc > int8_max) {
+        acc = int8_max;
+      }
+      if (acc < int8_min) {
+        acc = int8_min;
+      }
+      proj_output[batch * n_output + row] = acc;
+    }
+  }
+}
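Both routines above rescale their accumulators with MultiplyByQuantizedMultiplier(acc, a, b), which applies the fixed-point scale a * 2^(b - 31). A minimal floating-point sketch of that contract, for readers of this patch (editorial; MultiplyByQuantizedMultiplierRef is a hypothetical name, and the real fixed-point path differs in its exact rounding):

#include <cmath>
#include <cstdint>

// Reference semantics: result ~= acc * multiplier * 2^(shift - 31); the
// multiplier is typically normalized into [2^30, 2^31) to keep precision.
int32_t MultiplyByQuantizedMultiplierRef(int64_t acc, int32_t multiplier,
                                         int32_t shift) {
  const double scale =
      static_cast<double>(multiplier) * std::pow(2.0, shift - 31);
  return static_cast<int32_t>(std::llround(static_cast<double>(acc) * scale));
}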
 
 void PortableApplyLayerNorm(const int16_t* input,
                             const int16_t* layer_norm_weights,
                             const int32_t* bias, int32_t layer_norm_scale_a,
@@ -390,6 +458,52 @@ void PortableApplyLayerNorm(const int16_t* input,
   }
 }
 
+void PortableApplyLayerNormFloat(const int16_t* input,
+                                 const int16_t* layer_norm_weights,
+                                 int32_t layer_norm_scale_a,
+                                 int32_t layer_norm_scale_b,
+                                 const int32_t* bias, int n_batch, int n_input,
+                                 int16_t* output) {
+  const int32_t int16_max = std::numeric_limits<int16_t>::max();
+  const int32_t int16_min = std::numeric_limits<int16_t>::min();
+  // This is to suppress a lint warning.
+  const double two = 2.0;
+  const float layer_norm_scale =
+      layer_norm_scale_a *
+      std::pow(two, static_cast<double>(layer_norm_scale_b - 31));
+  const float bias_scale = std::pow(two, -10) * layer_norm_scale;
+
+  for (int batch = 0; batch < n_batch; ++batch) {
+    float sum = 0.0f;
+    float sum_sq = 0.0f;
+    for (int i = 0; i < n_input; ++i) {
+      const int index = batch * n_input + i;
+      const float value = static_cast<float>(input[index]);
+      sum += value;
+      sum_sq += value * value;
+    }
+    const float mean = sum / n_input;
+    float stddev_inv = 0.0f;
+    const float variance = sum_sq / n_input - mean * mean;
+    if (variance == 0) {
+      stddev_inv = 1.0f / sqrt(1e-8);
+    } else {
+      stddev_inv = 1.0f / sqrt(variance);
+    }
+    for (int i = 0; i < n_input; ++i) {
+      const int index = batch * n_input + i;
+      const float normalized_value =
+          (static_cast<float>(input[index]) - mean) * stddev_inv;
+      const float weighted_normalized_value =
+          normalized_value * layer_norm_weights[i] * layer_norm_scale +
+          bias[i] * bias_scale;
+      const int32_t quant_output = static_cast<int32_t>(
+          std::round(weighted_normalized_value * std::pow(2, 12)));
+      output[index] = std::min(int16_max, std::max(int16_min, quant_output));
+    }
+  }
+}
+
 void PortableMatrixScalarMultiplyAccumulate(const int8_t* matrix,
                                             int32_t scalar, int32_t n_row,
                                             int32_t n_col, int32_t* output) {
@@ -416,6 +530,24 @@ void PortableApplySigmoid(const int16_t* input, int32_t n_batch,
   }
 }
 
+void PortableApplySigmoidFloat(const int16_t* input, int32_t n_batch,
+                               int32_t n_input, int16_t* output) {
+  const int32_t int16_max = std::numeric_limits<int16_t>::max();
+  const int32_t int16_min = std::numeric_limits<int16_t>::min();
+  for (int batch = 0; batch < n_batch; ++batch) {
+    for (int i = 0; i < n_input; ++i) {
+      const int index = batch * n_input + i;
+      const float float_input = input[index] * std::pow(2, -12);
+      const float float_output = 1.0f / (1.0f + std::exp(-float_input));
+      const int32_t quant_output =
+          static_cast<int32_t>(float_output * std::pow(2, 15));
+      const int32_t quant_output_clamped =
+          std::min(int16_max, std::max(int16_min, quant_output));
+      output[index] = static_cast<int16_t>(quant_output_clamped);
+    }
+  }
+}
+
 template <int IntegerBits>
 void PortableApplyTanhImpl(const int16_t* input, int32_t n_batch,
                            int32_t n_input, int16_t* output) {
@@ -452,6 +584,27 @@ void PortableApplyTanh(int32_t integer_bits, const int16_t* input,
 #undef DISPATCH_TANH
 }
 
+void PortableApplyTanhFloat(const int16_t* input, int32_t n_batch,
+                            int32_t n_input, int32_t integer_bits,
+                            int16_t* output) {
+  const int32_t int16_max = std::numeric_limits<int16_t>::max();
+  const int32_t int16_min = std::numeric_limits<int16_t>::min();
+  const double two = 2.0;
+  for (int batch = 0; batch < n_batch; ++batch) {
+    for (int i = 0; i < n_input; ++i) {
+      const int index = batch * n_input + i;
+      const float float_input =
+          input[index] * std::pow(two, static_cast<double>(integer_bits));
+      const float float_output = std::tanh(float_input);
+      const int32_t quant_output =
+          static_cast<int32_t>(float_output * std::pow(2, 15));
+      const int32_t quant_output_clamped =
+          std::min(int16_max, std::max(int16_min, quant_output));
+      output[index] = static_cast<int16_t>(quant_output_clamped);
+    }
+  }
+}
+
 void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
                       int n_batch, int n_input, int shift, int16_t* output) {
   for (int batch = 0; batch < n_batch; ++batch) {
@@ -666,5 +819,34 @@ void PortableMeanStddevNormalization(const float* input_vector,
   }
 }
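+// Example, using the first entries of TwoGateSaturateAddTest: x = 1 - 10 = -9
+// rescaled by 1347771520 * 2^(-7-31) is about -0.04 -> 0, and
+// h = 100 - (-5) = 105 rescaled by 1047577121 * 2^(-6-31) is about 0.80 -> 1,
+// so the saturated sum is 1, the first expected output (rounding per
+// MultiplyByQuantizedMultiplier).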
+void PortableTwoGateSaturationgAdd(const int8_t* input, int8_t input_zp,
+                                   const int8_t* recurrent, int8_t recurrent_zp,
+                                   int32_t input_effective_scale_a,
+                                   int32_t input_effective_scale_b,
+                                   int32_t recurrent_effective_scale_a,
+                                   int32_t recurrent_effective_scale_b,
+                                   int32_t n_batch, int32_t n_cell,
+                                   int16_t* output) {
+  const int32_t int16_max = std::numeric_limits<int16_t>::max();
+  const int32_t int16_min = std::numeric_limits<int16_t>::min();
+  for (int i = 0; i < n_batch * n_cell; ++i) {
+    int32_t x = static_cast<int32_t>(input[i]) - static_cast<int32_t>(input_zp);
+    int32_t h = static_cast<int32_t>(recurrent[i]) -
+                static_cast<int32_t>(recurrent_zp);
+    int32_t x_scaled = MultiplyByQuantizedMultiplier(x, input_effective_scale_a,
+                                                     input_effective_scale_b);
+    int32_t h_scaled = MultiplyByQuantizedMultiplier(
+        h, recurrent_effective_scale_a, recurrent_effective_scale_b);
+    int32_t y = h_scaled + x_scaled;
+    if (y > int16_max) {
+      y = int16_max;
+    }
+    if (y < int16_min) {
+      y = int16_min;
+    }
+    output[i] = static_cast<int16_t>(y);
+  }
+}
+
 }  // namespace tensor_utils
 }  // namespace tflite
diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h
index f5ae5ee173f..e51e5442c2a 100644
--- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h
+++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h
@@ -41,22 +41,22 @@ bool IsZeroVector(const int8_t* vector, int v_size) {
 void SymmetricQuantizeFloats(const float* values, const int size,
                              int8_t* quantized_values, float* min, float* max,
                              float* scaling_factor) {
-  return PortableSymmetricQuantizeFloats(values, size, quantized_values, min,
-                                         max, scaling_factor);
+  PortableSymmetricQuantizeFloats(values, size, quantized_values, min, max,
+                                  scaling_factor);
 }
 
 void SymmetricQuantizeFloats(const float* values, const int size,
                              int8_t* quantized_values, float min_value,
                              float max_value, float* scaling_factor) {
-  return PortableSymmetricQuantizeFloats(values, size, quantized_values,
-                                         min_value, max_value, scaling_factor);
+  PortableSymmetricQuantizeFloats(values, size, quantized_values, min_value,
+                                  max_value, scaling_factor);
 }
 
 void AsymmetricQuantizeFloats(const float* values, const int size,
                               int8_t* quantized_values, float* scaling_factor,
                               int32_t* offset) {
-  return PortableAsymmetricQuantizeFloats(values, size, quantized_values,
-                                          scaling_factor, offset);
+  PortableAsymmetricQuantizeFloats(values, size, quantized_values,
+                                   scaling_factor, offset);
 }
 
 void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
@@ -104,7 +104,7 @@ void MatrixBatchVectorMultiplyAccumulate(
     const int8_t* __restrict__ vectors, const float* scaling_factors,
     int n_batch, float* __restrict__ result, int result_stride,
     const float* per_channel_scale, const int32_t* input_offset) {
-  return PortableMatrixBatchVectorMultiplyAccumulate(
+  PortableMatrixBatchVectorMultiplyAccumulate(
       matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result,
       result_stride, per_channel_scale, input_offset);
 }
@@ -152,6 +152,31 @@ void MatrixScalarMultiplyAccumulate(const int8_t* matrix, int32_t scalar,
   PortableMatrixScalarMultiplyAccumulate(matrix, scalar, n_row, n_col, output);
 }
 
+void MatrixBatchVectorMultiply(const int8_t* input, int32_t input_zeropoint,
+                               const int8_t* input_to_gate_weights,
+                               int32_t input_to_gate_effective_scale_a,
+                               int32_t input_to_gate_effective_scale_b,
+                               int32_t n_batch, int32_t n_input, int32_t n_cell,
+                               int8_t* gate_output, int8_t gate_output_zp) {
+  PortableMatrixBatchVectorMultiply(
+      input, input_zeropoint, input_to_gate_weights,
+      input_to_gate_effective_scale_a, input_to_gate_effective_scale_b, n_batch,
+      n_input, n_cell,
gate_output, gate_output_zp); +} + +void MatrixBatchVectorMultiply(const int16_t* hidden, + const int8_t* hidden_to_output_weights, + int32_t proj_effective_scale_a, + int32_t proj_effective_scale_b, + const int32_t* gate_bias, int32_t n_batch, + int32_t n_hidden, int32_t n_output, + int32_t output_zp, int8_t* proj_output) { + PortableMatrixBatchVectorMultiply(hidden, hidden_to_output_weights, + proj_effective_scale_a, + proj_effective_scale_b, gate_bias, n_batch, + n_hidden, n_output, output_zp, proj_output); +} + void ApplyLayerNorm(const int16_t* input, const int16_t* layer_norm_weights, const int32_t* bias, int32_t layer_norm_scale_a, int32_t layer_norm_scale_b, int32_t variance_limit, @@ -161,16 +186,36 @@ void ApplyLayerNorm(const int16_t* input, const int16_t* layer_norm_weights, output); } +void ApplyLayerNormFloat(const int16_t* input, + const int16_t* layer_norm_weights, + int32_t layer_norm_scale_a, int32_t layer_norm_scale_b, + const int32_t* bias, int n_batch, int n_input, + int16_t* output) { + PortableApplyLayerNormFloat(input, layer_norm_weights, layer_norm_scale_a, + layer_norm_scale_b, bias, n_batch, n_input, + output); +} + void ApplySigmoid(const int16_t* input, int32_t n_batch, int32_t n_input, int16_t* output) { PortableApplySigmoid(input, n_batch, n_input, output); } +void ApplySigmoidFloat(const int16_t* input, int32_t n_batch, int32_t n_input, + int16_t* output) { + PortableApplySigmoidFloat(input, n_batch, n_input, output); +} + void ApplyTanh(int32_t integer_bits, const int16_t* input, int32_t n_batch, int32_t n_input, int16_t* output) { PortableApplyTanh(integer_bits, input, n_batch, n_input, output); } +void ApplyTanhFloat(const int16_t* input, int32_t n_batch, int32_t n_input, + int32_t integer_bits, int16_t* output) { + PortableApplyTanhFloat(input, n_batch, n_input, integer_bits, output); +} + void CwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch, int n_input, int shift, int16_t* output) { PortableCwiseMul(input_1, input_2, n_batch, n_input, shift, output); @@ -214,8 +259,8 @@ float VectorVectorDotProduct(const float* vector1, const float* vector2, void BatchVectorBatchVectorDotProduct(const int16_t* vector1, const int16_t* vector2, int v_size, int n_batch, int32_t* result) { - return PortableBatchVectorBatchVectorDotProduct(vector1, vector2, v_size, - n_batch, result); + PortableBatchVectorBatchVectorDotProduct(vector1, vector2, v_size, n_batch, + result); } void VectorBatchVectorAdd(const float* vector, int v_size, int n_batch, @@ -265,6 +310,19 @@ void MeanStddevNormalization(const float* input_vector, float* output_vector, PortableMeanStddevNormalization(input_vector, output_vector, v_size, n_batch); } +void TwoGateSaturationgAdd(const int8_t* input, int8_t input_zp, + const int8_t* recurrent, int8_t recurrent_zp, + int32_t input_effective_scale_a, + int32_t input_effective_scale_b, + int32_t recurrent_effective_scale_a, + int32_t recurrent_effective_scale_b, int32_t n_batch, + int32_t n_cell, int16_t* output) { + PortableTwoGateSaturationgAdd( + input, input_zp, recurrent, recurrent_zp, input_effective_scale_a, + input_effective_scale_b, recurrent_effective_scale_a, + recurrent_effective_scale_b, n_batch, n_cell, output); +} + } // namespace tensor_utils } // namespace tflite diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h index fb86aef1a19..b14d4c5b3f0 100644 --- 
a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h
+++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h
@@ -122,6 +122,21 @@ void PortableMatrixBatchVectorMultiplyAccumulate(
     int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
     int32_t* scratch, int8_t* output, CpuBackendContext* context);
 
+void PortableMatrixBatchVectorMultiply(const int8_t* input,
+                                       int32_t input_zeropoint,
+                                       const int8_t* input_to_gate_weights,
+                                       int32_t input_to_gate_effective_scale_a,
+                                       int32_t input_to_gate_effective_scale_b,
+                                       int32_t n_batch, int32_t n_input,
+                                       int32_t n_cell, int8_t* gate_output,
+                                       int8_t gate_output_zp);
+
+void PortableMatrixBatchVectorMultiply(
+    const int16_t* hidden, const int8_t* hidden_to_output_weights,
+    int32_t proj_effective_scale_a, int32_t proj_effective_scale_b,
+    const int32_t* gate_bias, int32_t n_batch, int32_t n_hidden,
+    int32_t n_output, int32_t output_zp, int8_t* proj_output);
+
 void PortableMatrixScalarMultiplyAccumulate(const int8_t* matrix,
                                             int32_t scalar, int32_t n_row,
                                             int32_t n_col, int32_t* output);
@@ -132,12 +147,26 @@ void PortableApplyLayerNorm(const int16_t* input,
                             int32_t layer_norm_scale_b, int32_t variance_limit,
                             int n_batch, int n_input, int16_t* output);
 
+void PortableApplyLayerNormFloat(const int16_t* input,
+                                 const int16_t* layer_norm_weights,
+                                 int32_t layer_norm_scale_a,
+                                 int32_t layer_norm_scale_b,
+                                 const int32_t* bias, int n_batch, int n_input,
+                                 int16_t* output);
+
 void PortableApplySigmoid(const int16_t* input, int32_t n_batch,
                           int32_t n_input, int16_t* output);
 
+void PortableApplySigmoidFloat(const int16_t* input, int32_t n_batch,
+                               int32_t n_input, int16_t* output);
+
 void PortableApplyTanh(int32_t integer_bits, const int16_t* input,
                        int32_t n_batch, int32_t n_input, int16_t* output);
 
+void PortableApplyTanhFloat(const int16_t* input, int32_t n_batch,
+                            int32_t n_input, int32_t integer_bits,
+                            int16_t* output);
+
 void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
                       int n_batch, int n_input, int shift, int16_t* output);
 
@@ -197,6 +226,16 @@ void PortableMeanStddevNormalization(const float* input_vector,
                                      float* output_vector, int v_size,
                                      int n_batch);
 
+// Saturate Add.
+void PortableTwoGateSaturationgAdd(const int8_t* input, int8_t input_zp,
+                                   const int8_t* recurrent, int8_t recurrent_zp,
+                                   int32_t input_effective_scale_a,
+                                   int32_t input_effective_scale_b,
+                                   int32_t recurrent_effective_scale_a,
+                                   int32_t recurrent_effective_scale_b,
+                                   int32_t n_batch, int32_t n_cell,
+                                   int16_t* output);
+
 }  // namespace tensor_utils
 }  // namespace tflite
diff --git a/tensorflow/lite/kernels/internal/tensor_utils.h b/tensorflow/lite/kernels/internal/tensor_utils.h
index a939da1448e..58a897cc5fe 100644
--- a/tensorflow/lite/kernels/internal/tensor_utils.h
+++ b/tensorflow/lite/kernels/internal/tensor_utils.h
@@ -209,6 +209,27 @@ void MatrixBatchVectorMultiplyAccumulate(
     int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
     int32_t* scratch, int8_t* output, CpuBackendContext* context);
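+// Note: the input zero point can be hoisted out of the inner product, since
+//   sum_c (x[c] - zp) * w[r][c] == sum_c x[c] * w[r][c] - zp * sum_c w[r][c],
+// so zp times each row sum of the weights can be precomputed once; that is
+// what the TODO below refers to.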
+// Same as the above 8, 8, 8 integer matmul, except that a zero point is
+// applied and the result is written out rather than accumulated.
+// TODO(b/148688698): remove this function by folding zero point calculation in
+// prepare() function.
+void MatrixBatchVectorMultiply(const int8_t* input, int32_t input_zeropoint,
+                               const int8_t* input_to_gate_weights,
+                               int32_t input_to_gate_effective_scale_a,
+                               int32_t input_to_gate_effective_scale_b,
+                               int32_t n_batch, int32_t n_input, int32_t n_cell,
+                               int8_t* gate_output, int8_t gate_output_zp);
+
+// Same as above but with 16-bit and 8-bit input and 8-bit output.
+// Used in projection when the hidden state is 16-bit.
+void MatrixBatchVectorMultiply(const int16_t* hidden,
+                               const int8_t* hidden_to_output_weights,
+                               int32_t proj_effective_scale_a,
+                               int32_t proj_effective_scale_b,
+                               const int32_t* gate_bias, int32_t n_batch,
+                               int32_t n_hidden, int32_t n_output,
+                               int32_t output_zp, int8_t* proj_output);
+
 // Multiplies a matrix with a scalar and reduce the result on each row to a
 // scalar.
 // Parameters:
@@ -241,6 +262,13 @@ void ApplyLayerNorm(const int16_t* input, const int16_t* layer_norm_weights,
                     int32_t layer_norm_scale_b, int32_t variance_limit,
                     int n_batch, int n_input, int16_t* output);
 
+// Same as above but the internal calculation is done in float.
+void ApplyLayerNormFloat(const int16_t* input,
+                         const int16_t* layer_norm_weights,
+                         int32_t layer_norm_scale_a, int32_t layer_norm_scale_b,
+                         const int32_t* bias, int n_batch, int n_input,
+                         int16_t* output);
+
 // Apply Sigmoid to a quantized vector.
 // Parameters:
 //   - input: batch vector of size n_batch * n_input; 16 bit.
@@ -251,6 +279,10 @@ void ApplyLayerNorm(const int16_t* input, const int16_t* layer_norm_weights,
 void ApplySigmoid(const int16_t* input, int32_t n_batch, int32_t n_input,
                   int16_t* output);
 
+// Same as above but the internal calculation is float.
+void ApplySigmoidFloat(const int16_t* input, int32_t n_batch, int32_t n_input,
+                       int16_t* output);
+
 // Apply Tanh to a quantized vector.
 // Parameters:
 //   - integer_bits: the integer bits of the input.
@@ -263,6 +295,12 @@ void ApplySigmoid(const int16_t* input, int32_t n_batch, int32_t n_input,
 void ApplyTanh(int32_t integer_bits, const int16_t* input, int32_t n_batch,
                int32_t n_input, int16_t* output);
 
+// Apply Tanh to a quantized vector. The internal calculation is in float.
+//   - Input has 2^(integer_bits) as scale.
+//   - Output has Q0.15 as scale.
+void ApplyTanhFloat(const int16_t* input, int32_t n_batch, int32_t n_input,
+                    int32_t integer_bits, int16_t* output);
+
 // Element-wise multiplication of two quantized vectors.
 // Parameters:
 //   - input_1: batch vector of size n_batch * n_input; 16 bit.
@@ -553,6 +591,16 @@ void ReductionSumVector(const int8_t* input_vector, int32_t* output_vector,
 // Layer norm for each batch.
 void MeanStddevNormalization(const float* input_vector, float* output_vector,
                              int v_size, int n_batch);
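As a reading aid for the *Float variants declared above, a minimal floating-point reference of the dequantize -> activate -> requantize round trip (editorial sketch under the scales stated in the comments; TanhFloatRef is a hypothetical name, and results may differ from the portable kernels by a few ULPs of float rounding):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Input is Q(integer_bits): real = q * 2^(integer_bits); output is Q0.15.
inline int16_t TanhFloatRef(int16_t q, int32_t integer_bits) {
  const float x = q * std::pow(2.0f, static_cast<float>(integer_bits));
  const int32_t out = static_cast<int32_t>(std::round(std::tanh(x) * 32768.0f));
  return static_cast<int16_t>(std::min(32767, std::max(-32768, out)));
}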
+
+// Saturate Add with rescale on both inputs.
+void TwoGateSaturationgAdd(const int8_t* input, int8_t input_zp,
+                           const int8_t* recurrent, int8_t recurrent_zp,
+                           int32_t input_effective_scale_a,
+                           int32_t input_effective_scale_b,
+                           int32_t recurrent_effective_scale_a,
+                           int32_t recurrent_effective_scale_b, int32_t n_batch,
+                           int32_t n_cell, int16_t* output);
+
 }  // namespace tensor_utils
 }  // namespace tflite
diff --git a/tensorflow/lite/kernels/internal/tensor_utils_test.cc b/tensorflow/lite/kernels/internal/tensor_utils_test.cc
index 5eaa0a9aebf..71df1b7468b 100644
--- a/tensorflow/lite/kernels/internal/tensor_utils_test.cc
+++ b/tensorflow/lite/kernels/internal/tensor_utils_test.cc
@@ -520,6 +520,102 @@ TEST(uKernels, QuantMatrixBatchVectorMultiplyAccumulate8x8_8Test) {
   EXPECT_THAT(output, testing::ElementsAreArray(expected_output));
 }
 
+// Quantized matmul with 2 * 30 input and 9 * 30 matrix with zero point.
+TEST(uKernels, QuantMatrixBatchVectorMultiply8x8_8WithZPTest) {
+  const int32_t input_zp = 3;
+  const std::vector<int8_t> input = {
+      4, -41, 5, -41, 22, 17, -30, 24, 13, -47, 18, 9, -11, -30, 16,
+      -47, 12, 36, -20, 27, -3, 0, -51, -31, 3, -8, -38, 43, 23, 12,
+      11, -23, -26, 23, 14, -9, -44, 22, 21, -30, 3, -47, -26, -21, -24,
+      -44, 34, -11, -23, -28, 26, -38, 19, 35, 9, 23, 6, -42, -25, 28,
+  };
+  const std::vector<int8_t> input_to_gate_weights = {
+      13, -7, -20, -22, 8, -46, 9, -2, -18, -42, 40, 28, -7, 24, 34,
+      -7, -24, -24, 19, 14, -19, -6, -2, -3, 5, -36, -13, 6, -27, 36,
+      -23, 0, 20, -37, -23, 9, 17, -41, 33, -15, -18, -42, -41, -34, -16,
+      -6, 12, -14, -15, -20, -14, 21, -3, -1, -26, 54, 51, 35, -14, 9,
+      -2, 13, -6, 39, 34, -21, 39, -51, 19, -44, 52, 0, -2, -38, -35,
+      -33, 4, -22, -37, 27, -23, 3, -10, 5, 32, 6, 1, -35, 24, -19,
+      46, 43, -55, 5, 38, -14, 32, -43, -44, -17, -13, -28, 56, 28, -42,
+      4, 10, -7, 25, -15, -9, -25, -14, -15, 6, -10, -22, 40, -72, 18,
+      -6, -18, -2, 37, -13, -10, 11, -9, 32, -28, 19, -2, 4, -31, 50,
+      -15, 23, -34, -9, 41, -6, -34, 17, 2, 24, -15, 21, -17, -8, -20,
+      1, -63, 19, -40, 12, -5, 5, -6, 1, 19, -9, -23, 5, -34, 11,
+      26, 21, 54, 34, -43, -29, 1, 16, 31, -56, -28, 57, -15, -23, 37,
+      -17, -3, -6, 29, 18, 77, 17, -20, -14, -19, 8, -24, -7, -45, -3,
+      0, -25, -8, 6, 9, 3, -15, 51, 4, -15, -19, -16, -14, -47, -52,
+      25, 9, 58, 26, -9, -27, 49, -6, -21, 21, 18, 12, -9, -9, 14,
+      31, -26, -19, -50, 17, 35, 11, -10, 22, -16, -43, -2, 26, 55, -20,
+      -7, 21, 33, -20, 26, -15, -22, 30, 27, 3, -34, 26, 12, -1, 19,
+      26, -25, 10, 30, 30, -14, -23, -23, -35, -16, 26, -41, 11, 1, 21,
+  };
+  const int32_t multiplier = 1347771520;
+  const int32_t shift = -7;
+  const int32_t output_zp = -11;
+
+  std::vector<int8_t> output = {1, 2, 3, 4, 5, 6, 5, 4, 3,
+                                2, 1, 2, 8, -1, -2, 11, 17, 18};
+
+  MatrixBatchVectorMultiply(
+      input.data(), input_zp, input_to_gate_weights.data(), multiplier, shift,
+      /*n_batch=*/2, /*n_input=*/30, /*n_cell=*/9, output.data(), output_zp);
+  const std::vector<int8_t> expected_output = {6, -9, -4, -32, -10, -17,
+                                               -25, -25, 14, -19, 3, 10,
+                                               -12, 10, 0, 1, -57, -41};
+
+  EXPECT_THAT(output, testing::ElementsAreArray(expected_output));
+}
+
+// Quantized matmul with 2 * 30 input and 9 * 30 matrix with zero point.
+TEST(uKernels, QuantMatrixBatchVectorMultiply16x8_8WithZPTest) {
+  const std::vector<int16_t> input = {
+      400, -41, 5, -41, 22, 17, -30, 24, 130, -47, 18, 9, -11, -30, 16,
+      -47, 12, 36, -20, 27, -3, 0, -51, -31, 3, -8, -38, 43, 23, 12,
+      11, -23, -26, 23, 14, -9, -44, 22, 21, -30, 3, -47, -26, -21, -24,
+      -44, 34, -11, -23, -28, 26, -38, 19, 35, 9, 23, 6, -42, -25, 28,
+  };
+  const std::vector<int8_t> input_to_gate_weights = {
+      13, -7, -20, -22, 8, -46, 9, -2, -18, -42, 40, 28, -7, 24, 34,
+      -7, -24, -24, 19, 14, -19, -6, -2, -3, 5, -36, -13, 6, -27, 36,
+      -23, 0, 20, -37, -23, 9, 17, -41, 33, -15, -18, -42, -41, -34, -16,
+      -6, 12, -14, -15, -20, -14, 21, -3, -1, -26, 54, 51, 35, -14, 9,
+      -2, 13, -6, 39, 34, -21, 39, -51, 19, -44, 52, 0, -2, -38, -35,
+      -33, 4, -22, -37, 27, -23, 3, -10, 5, 32, 6, 1, -35, 24, -19,
+      46, 43, -55, 5, 38, -14, 32, -43, -44, -17, -13, -28, 56, 28, -42,
+      4, 10, -7, 25, -15, -9, -25, -14, -15, 6, -10, -22, 40, -72, 18,
+      -6, -18, -2, 37, -13, -10, 11, -9, 32, -28, 19, -2, 4, -31, 50,
+      -15, 23, -34, -9, 41, -6, -34, 17, 2, 24, -15, 21, -17, -8, -20,
+      1, -63, 19, -40, 12, -5, 5, -6, 1, 19, -9, -23, 5, -34, 11,
+      26, 21, 54, 34, -43, -29, 1, 16, 31, -56, -28, 57, -15, -23, 37,
+      -17, -3, -6, 29, 18, 77, 17, -20, -14, -19, 8, -24, -7, -45, -3,
+      0, -25, -8, 6, 9, 3, -15, 51, 4, -15, -19, -16, -14, -47, -52,
+      25, 9, 58, 26, -9, -27, 49, -6, -21, 21, 18, 12, -9, -9, 14,
+      31, -26, -19, -50, 17, 35, 11, -10, 22, -16, -43, -2, 26, 55, -20,
+      -7, 21, 33, -20, 26, -15, -22, 30, 27, 3, -34, 26, 12, -1, 19,
+      26, -25, 10, 30, 30, -14, -23, -23, -35, -16, 26, -41, 11, 1, 21,
+  };
+
+  const std::vector<int32_t> input_zeropoint_times_weights = {
+      0, 2, 3, 4, 5, 4, 3, 2, 10,
+  };
+  const int32_t multiplier = 1347771520;
+  const int32_t shift = -8;
+  const int32_t output_zp = -11;
+
+  std::vector<int8_t> output = {1, 2, 3, 4, 5, 6, 5, 4, 3,
+                                2, 1, 2, 8, -1, -2, 11, 17, 18};
+
+  MatrixBatchVectorMultiply(
+      input.data(), input_to_gate_weights.data(), multiplier, shift,
+      input_zeropoint_times_weights.data(),
+      /*n_batch=*/2, /*n_hidden=*/30, /*n_output=*/9, output_zp, output.data());
+  const std::vector<int8_t> expected_output = {4, -24, -5, 10, -7, -13,
+                                               -39, 2, 3, -16, -5, -1,
+                                               -12, -1, -6, -6, -33, -25};
+
+  EXPECT_THAT(output, testing::ElementsAreArray(expected_output));
+}
+
 // Quantized matmul with 9 * 30 matrix.
 TEST(uKernels, MatrixScalarMultiplyAccumulateTest) {
   std::vector<int32_t> output = {
@@ -585,6 +681,37 @@ TEST(uKernels, QuantApplyLayerNormTest) {
   EXPECT_THAT(output, testing::ElementsAreArray(expected_output));
 }
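For reference when reading the (multiplier, shift) constants in these tests: QuantizeMultiplier encodes a real-valued scale as multiplier * 2^(shift - 31). A one-line editorial helper (hypothetical name, assuming that convention):

#include <cmath>
#include <cstdint>

// Decodes a (multiplier, shift) pair back into the real scale it represents.
double ScaleFromQuantizedMultiplier(int32_t multiplier, int32_t shift) {
  return static_cast<double>(multiplier) * std::pow(2.0, shift - 31);
}
// e.g. the layer-norm test below uses (1895840000, -13), i.e. a scale of
// roughly 1895840000 * 2^-44, about 1.08e-4.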
+// Quantized layer norm of n_batch = 2 and n_input = 15.
+TEST(uKernels, QuantApplyLayerNormFloatTest) {
+  const std::vector<int16_t> input = {
+      -310, 596, 34, -68, 475, 92, 672, -54, -913, -200,
+      -1194, -836, -620, -237, 991, 533, 721, -736, -8, -941,
+      -372, -1084, 591, 2557, -779, 175, 582, 956, -287, 944,
+  };
+  const std::vector<int16_t> layer_norm_weights = {
+      21849, 22882, 20626, 23854, 24779, 26354, 12980, 26231,
+      23716, 27271, 24937, 22647, 24715, 22854, 19646,
+  };
+  const std::vector<int32_t> bias_weight = {
+      -14175520, -13805465, -16027609, -13786809, -13321033,
+      -14399810, -15055368, -14536623, -14508746, -13784007,
+      -15206609, -15125830, -14996304, -14847597, -12814379,
+  };
+  const int32_t multiplier = 1895840000;
+  const int32_t shift = -13;
+
+  std::vector<int16_t> output(2 * 15, 0);
+  ApplyLayerNormFloat(input.data(), layer_norm_weights.data(), multiplier,
+                      shift, bias_weight.data(), 2, 15, output.data());
+  const std::vector<int16_t> expected_output = {
+      -9408, 5844, -4803, -5297, 4826, -2392, 927, -5286,
+      -20353, -7851, -26534, -18701, -15830, -8623, 10312, -2524,
+      -136, -16053, -8206, -19160, -13299, -14407, -1233, 20617,
+      -18594, -6736, -2272, 2597, -11620, 1566};
+
+  EXPECT_THAT(output, testing::ElementsAreArray(expected_output));
+}
+
 // Quantized tanh with Q0.15 input and Q0.15 output.
 TEST(uKernels, QuantTanh0Test) {
   const std::vector<int16_t> input = {
@@ -631,6 +758,29 @@ TEST(uKernels, QuantTanh3Test) {
   EXPECT_THAT(output, testing::ElementsAreArray(expected_output));
 }
 
+// Quantized tanh with float calculation.
+TEST(uKernels, QuantTanhFloatTest) {
+  const std::vector<int16_t> input = {
+      -1, 0, 1, -35, 264, 289, 8, 27, -37, -1310,
+      -120, 127, -16, 106, 370, -583, -299, 93, -548, 548,
+      653, -29, -53, 1058, -52, -164, -149, -635, 201, -1297,
+      -145, 899, -176, -35, 264, 289, 8, 27, -37, -1310,
+      -120, 127, -16, 106, 370, -583, -299, 93, -548, 548,
+      653, -29, -53, 1058, -52, -164, -149, -635, 201, -1297,
+  };
+  std::vector<int16_t> output(4 * 15, 0);
+  ApplyTanhFloat(input.data(), 4, 15, -12, output.data());
+  const std::vector<int16_t> expected_output = {
+      -8, 0, 8, -279, 2109, 2308, 63, 215, -295, -10136,
+      -959, 1015, -127, 847, 2951, -4632, -2387, 743, -4358, 4358,
+      5180, -231, -423, 8280, -415, -1311, -1191, -5039, 1606, -10042,
+      -1159, 7078, -1407, -279, 2109, 2308, 63, 215, -295, -10136,
+      -959, 1015, -127, 847, 2951, -4632, -2387, 743, -4358, 4358,
+      5180, -231, -423, 8280, -415, -1311, -1191, -5039, 1606, -10042};
+
+  EXPECT_THAT(output, testing::ElementsAreArray(expected_output));
+}
+
 // Quantized tanh with Q4.11 input and Q0.15 output.
 TEST(uKernels, QuantTanh4Test) {
   const std::vector<int16_t> input = {
@@ -676,6 +826,30 @@ TEST(uKernels, QuantSigmoidTest) {
   EXPECT_THAT(output, testing::ElementsAreArray(expected_output));
 }
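+// Reading aid: Q3.12 means 3 integer and 12 fractional bits, so a raw int16
+// value q represents q / 4096.0, and the Q0.15 output represents q / 32768.0.
+// For the first input below, -10500 / 4096 is about -2.5635, and
+// sigmoid(-2.5635) * 32768 is about 2344, matching the expected 2343 up to
+// float rounding.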
+// Quantized sigmoid with Q3.12 input and Q0.15 output.
+TEST(uKernels, QuantSigmoidFloatTest) {
+  const std::vector<int16_t> input = {
+      -10500, 1398, -6963, -7404, 485, -5401, -1757, -7668, -19248,
+      -9692, -24249, -17923, -15840, -10026, 5249, -89, 1787, -16178,
+      -6691, -19524, -13439, -24048, -1123, 32767, -17267, -3378, 823,
+      11482, -11139, 7508, -10500, 1398, -6963, -7404, 485, -5401,
+      -1757, -7668, -19248, -9692, -24249, -17923, -15840, -10026, 5249,
+      -89, 1787, -16178, -6691, -19524, -13439, -24048, -1123, 32767,
+      -17267, -3378, 823, 11482, -11139, 7508,
+  };
+  std::vector<int16_t> output(4 * 15, 0);
+  ApplySigmoidFloat(input.data(), 4, 15, output.data());
+  const std::vector<int16_t> expected_output = {
+      2343, 19153, 5061, 4617, 17352, 6915, 12922, 4368, 295, 2811,
+      87, 407, 671, 2608, 25647, 16206, 19902, 619, 5352, 276,
+      1187, 92, 14151, 32757, 476, 9986, 18024, 30895, 2026, 28249,
+      2343, 19153, 5061, 4617, 17352, 6915, 12922, 4368, 295, 2811,
+      87, 407, 671, 2608, 25647, 16206, 19902, 619, 5352, 276,
+      1187, 92, 14151, 32757, 476, 9986, 18024, 30895, 2026, 28249};
+
+  EXPECT_THAT(output, testing::ElementsAreArray(expected_output));
+}
+
 // Quantized Multiply with 16bit output and 15 bit shift.
 TEST(uKernels, QuantMul16bitOut15ShiftTest) {
   const std::vector<int16_t> input1 = {
@@ -1745,6 +1919,33 @@ TEST(uKernels, ReductionSumVectorIntegerTest) {
   EXPECT_THAT(result1, testing::ElementsAreArray({3, 6, -1, 3, 15}));
 }
 
+void TwoGateSaturationgAdd(const int8_t* input, int8_t input_zp,
+                           const int8_t* recurrent, int8_t recurrent_zp,
+                           int32_t input_effective_scale_a,
+                           int32_t input_effective_scale_b,
+                           int32_t recurrent_effective_scale_a,
+                           int32_t recurrent_effective_scale_b, int32_t n_batch,
+                           int32_t n_cell, int16_t* output);
+
+TEST(uKernels, TwoGateSaturateAddTest) {
+  const std::vector<int8_t> input1 = {1, 2, 3, 4, 55, 66, 77};
+  const std::vector<int8_t> input2 = {100, 2, 3, 4, 55, 66, 77};
+  const int32_t input1_zp = 10;
+  const int32_t input2_zp = -5;
+  const int32_t multiplier1 = 1347771520;
+  const int32_t shift1 = -7;
+  const int32_t multiplier2 = 1047577121;
+  const int32_t shift2 = -6;
+  std::vector<int16_t> output(7);
+
+  TwoGateSaturationgAdd(input1.data(), input1_zp, input2.data(), input2_zp,
+                        multiplier1, shift1, multiplier2, shift2, 1, 7,
+                        output.data());
+
+  const std::vector<int16_t> expected_output = {1, 0, 0, 0, 0, 1, 1};
+  EXPECT_THAT(output, testing::ElementsAreArray(expected_output));
+}
+
 namespace {
 // Parameterized test: mean, difference, tolerance.
 // Input is constructed as [mean-2*diff, mean-diff, mean+diff, mean+2*diff]
diff --git a/tensorflow/lite/kernels/lstm.cc b/tensorflow/lite/kernels/lstm.cc
index 217cf5cdee0..fceea866fca 100644
--- a/tensorflow/lite/kernels/lstm.cc
+++ b/tensorflow/lite/kernels/lstm.cc
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -59,7 +59,8 @@ struct OpData {
 namespace full {
 namespace {
 
-TfLiteStatus PopulateQuantizedLstmParams(
+
+TfLiteStatus PopulateQuantizedLstmParams8x8_16(
     TfLiteContext* context, TfLiteNode* node,
     lstm_eval::IntegerLstmParameter* integer_lstm_param) {
   // Calculate quantized clip for projection and cell.
@@ -366,6 +367,361 @@ TfLiteStatus PopulateQuantizedLstmParams(
   return kTfLiteOk;
 }
 
+TfLiteStatus PopulateQuantizedLstmParams8x8_8(
+    TfLiteContext* context, TfLiteNode* node,
+    lstm_eval::IntegerLstmParameter* integer_lstm_param) {
+  // Get all tensors.
+ const TfLiteTensor* input = GetInput(context, node, kInputTensor); + const TfLiteTensor* input_to_input_weights = + GetOptionalInputTensor(context, node, kInputToInputWeightsTensor); + const TfLiteTensor* input_to_forget_weights = + GetInput(context, node, kInputToForgetWeightsTensor); + const TfLiteTensor* input_to_cell_weights = + GetInput(context, node, kInputToCellWeightsTensor); + const TfLiteTensor* input_to_output_weights = + GetInput(context, node, kInputToOutputWeightsTensor); + + const TfLiteTensor* recurrent_to_input_weights = + GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor); + const TfLiteTensor* recurrent_to_forget_weights = + GetInput(context, node, kRecurrentToForgetWeightsTensor); + const TfLiteTensor* recurrent_to_cell_weights = + GetInput(context, node, kRecurrentToCellWeightsTensor); + const TfLiteTensor* recurrent_to_output_weights = + GetInput(context, node, kRecurrentToOutputWeightsTensor); + + const TfLiteTensor* cell_to_input_weights = + GetOptionalInputTensor(context, node, kCellToInputWeightsTensor); + const TfLiteTensor* cell_to_forget_weights = + GetOptionalInputTensor(context, node, kCellToForgetWeightsTensor); + const TfLiteTensor* cell_to_output_weights = + GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor); + + const TfLiteTensor* input_layer_norm_coefficients = + GetOptionalInputTensor(context, node, kInputLayerNormCoefficientsTensor); + const TfLiteTensor* forget_layer_norm_coefficients = + GetOptionalInputTensor(context, node, kForgetLayerNormCoefficientsTensor); + const TfLiteTensor* cell_layer_norm_coefficients = + GetOptionalInputTensor(context, node, kCellLayerNormCoefficientsTensor); + const TfLiteTensor* output_layer_norm_coefficients = + GetOptionalInputTensor(context, node, kOutputLayerNormCoefficientsTensor); + + const TfLiteTensor* input_gate_bias = + GetOptionalInputTensor(context, node, kInputGateBiasTensor); + const TfLiteTensor* forget_gate_bias = + GetInput(context, node, kForgetGateBiasTensor); + const TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor); + const TfLiteTensor* output_gate_bias = + GetInput(context, node, kOutputGateBiasTensor); + + const TfLiteTensor* projection_weights = + GetOptionalInputTensor(context, node, kProjectionWeightsTensor); + const TfLiteTensor* projection_bias = + GetOptionalInputTensor(context, node, kProjectionBiasTensor); + + TfLiteTensor* activation_state = + GetVariableInput(context, node, kInputActivationStateTensor); + TF_LITE_ENSURE(context, activation_state != nullptr); + TfLiteTensor* cell_state = + GetVariableInput(context, node, kInputCellStateTensor); + TF_LITE_ENSURE(context, cell_state != nullptr); + + // Since we have already checked that weights are all there or none, we can + // check the existence of only one to get the condition. + const bool use_cifg = (input_to_input_weights == nullptr); + const bool use_peephole = (cell_to_output_weights != nullptr); + const bool is_layer_norm_lstm = (forget_layer_norm_coefficients != nullptr); + const bool use_projection = (projection_weights != nullptr); + + // Weights and states. 
+ int8_t* input_to_input_weight_ptr = nullptr; + int8_t* recurrent_to_input_weight_ptr = nullptr; + int8_t* cell_to_input_weight_ptr = nullptr; + int8_t* input_to_forget_weight_ptr = nullptr; + int8_t* recurrent_to_forget_weight_ptr = nullptr; + int8_t* cell_to_forget_weight_ptr = nullptr; + int8_t* input_to_cell_weight_ptr = nullptr; + int8_t* recurrent_to_cell_weight_ptr = nullptr; + int8_t* input_to_output_weight_ptr = nullptr; + int8_t* recurrent_to_output_weight_ptr = nullptr; + int8_t* cell_to_output_weight_ptr = nullptr; + int8_t* proj_weight_ptr = nullptr; + int16_t* layer_norm_input_weight_ptr = nullptr; + int16_t* layer_norm_forget_weight_ptr = nullptr; + int16_t* layer_norm_cell_weight_ptr = nullptr; + int16_t* layer_norm_output_weight_ptr = nullptr; + int32_t* input_bias_ptr = nullptr; + int32_t* forget_bias_ptr = nullptr; + int32_t* cell_bias_ptr = nullptr; + int32_t* output_bias_ptr = nullptr; + int32_t* proj_bias_ptr = nullptr; + int16_t* cell_ptr = nullptr; + int8_t* activation_ptr = nullptr; + + // Scales. + const float default_scale = 1.0; + float input_scale = default_scale; + float input_to_input_weight_scale = default_scale; + float recurrent_to_input_weight_scale = default_scale; + float cell_to_input_weight_scale = default_scale; + float input_to_forget_weight_scale = default_scale; + float recurrent_to_forget_weight_scale = default_scale; + float cell_to_forget_weight_scale = default_scale; + float input_to_cell_weight_scale = default_scale; + float recurrent_to_cell_weight_scale = default_scale; + float input_to_output_weight_scale = default_scale; + float recurrent_to_output_weight_scale = default_scale; + float cell_to_output_weight_scale = default_scale; + float proj_weight_scale = default_scale; + float layer_norm_input_scale = default_scale; + float layer_norm_forget_scale = default_scale; + float layer_norm_cell_scale = default_scale; + float layer_norm_output_scale = default_scale; + float activation_scale = default_scale; + + // Effective scales. + float effective_input_to_input_scale = default_scale; + float effective_recurrent_to_input_scale = default_scale; + float effective_cell_to_input_scale = default_scale; + float effective_input_to_forget_scale = default_scale; + float effective_recurrent_to_forget_scale = default_scale; + float effective_cell_to_forget_scale = default_scale; + float effective_input_to_cell_scale = default_scale; + float effective_recurrent_to_cell_scale = default_scale; + float effective_input_to_output_scale = default_scale; + float effective_recurrent_to_output_scale = default_scale; + float effective_cell_to_output_scale = default_scale; + float effective_proj_scale = default_scale; + + // Zero points + int input_zp = 0; + int activation_zp = 0; + + // Populate all the values. 
+  if (!use_cifg) {
+    input_to_input_weight_ptr = input_to_input_weights->data.int8;
+    recurrent_to_input_weight_ptr = recurrent_to_input_weights->data.int8;
+    input_bias_ptr = input_gate_bias->data.i32;
+    input_to_input_weight_scale = input_to_input_weights->params.scale;
+    recurrent_to_input_weight_scale = recurrent_to_input_weights->params.scale;
+  }
+
+  if (use_peephole) {
+    if (!use_cifg) {
+      cell_to_input_weight_ptr = cell_to_input_weights->data.int8;
+      cell_to_input_weight_scale = cell_to_input_weights->params.scale;
+    }
+    cell_to_forget_weight_ptr = cell_to_forget_weights->data.int8;
+    cell_to_output_weight_ptr = cell_to_output_weights->data.int8;
+    cell_to_forget_weight_scale = cell_to_forget_weights->params.scale;
+    cell_to_output_weight_scale = cell_to_output_weights->params.scale;
+  }
+
+  if (is_layer_norm_lstm) {
+    if (!use_cifg) {
+      layer_norm_input_weight_ptr = input_layer_norm_coefficients->data.i16;
+      layer_norm_input_scale = input_layer_norm_coefficients->params.scale;
+    }
+    layer_norm_forget_weight_ptr = forget_layer_norm_coefficients->data.i16;
+    layer_norm_forget_scale = forget_layer_norm_coefficients->params.scale;
+    layer_norm_cell_weight_ptr = cell_layer_norm_coefficients->data.i16;
+    layer_norm_cell_scale = cell_layer_norm_coefficients->params.scale;
+    layer_norm_output_weight_ptr = output_layer_norm_coefficients->data.i16;
+    layer_norm_output_scale = output_layer_norm_coefficients->params.scale;
+  }
+
+  if (use_projection) {
+    proj_weight_ptr = projection_weights->data.int8;
+    proj_weight_scale = projection_weights->params.scale;
+    if (projection_bias) {
+      proj_bias_ptr = projection_bias->data.i32;
+    }
+  }
+  activation_scale = activation_state->params.scale;
+
+  input_to_forget_weight_ptr = input_to_forget_weights->data.int8;
+  input_to_forget_weight_scale = input_to_forget_weights->params.scale;
+  input_to_cell_weight_ptr = input_to_cell_weights->data.int8;
+  input_to_cell_weight_scale = input_to_cell_weights->params.scale;
+  input_to_output_weight_ptr = input_to_output_weights->data.int8;
+  input_to_output_weight_scale = input_to_output_weights->params.scale;
+  recurrent_to_forget_weight_ptr = recurrent_to_forget_weights->data.int8;
+  recurrent_to_forget_weight_scale = recurrent_to_forget_weights->params.scale;
+  recurrent_to_cell_weight_ptr = recurrent_to_cell_weights->data.int8;
+  recurrent_to_cell_weight_scale = recurrent_to_cell_weights->params.scale;
+  recurrent_to_output_weight_ptr = recurrent_to_output_weights->data.int8;
+  recurrent_to_output_weight_scale = recurrent_to_output_weights->params.scale;
+  forget_bias_ptr = forget_gate_bias->data.i32;
+  cell_bias_ptr = cell_bias->data.i32;
+  output_bias_ptr = output_gate_bias->data.i32;
+  activation_ptr = activation_state->data.int8;
+  cell_ptr = cell_state->data.i16;
+  input_scale = input->params.scale;
+  input_zp = input->params.zero_point;
+  activation_zp = activation_state->params.zero_point;
+
+  std::vector<float> intermediate_scale;
+  for (int i = 0; i < 12; ++i) {
+    TfLiteTensor* intermediate =
+        &context->tensors[node->intermediates->data[i]];
+    auto* params = reinterpret_cast<TfLiteAffineQuantization*>(
+        intermediate->quantization.params);
+    intermediate_scale.push_back(params->scale->data[0]);
+    integer_lstm_param->intermediate_zp[i] = params->zero_point->data[0];
+  }
+
+  // Calculate effective scales.
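+  // Each effective scale folds the scales on both sides of a matmul: for
+  // y = W * x, effective_scale = scale(W) * scale(x) / scale(y), with scale(y)
+  // taken from the corresponding intermediate tensor. The projection uses
+  // 2^-15 for scale(x) because the hidden state is Q0.15.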
+  if (!use_cifg) {
+    effective_input_to_input_scale =
+        input_to_input_weight_scale * input_scale / intermediate_scale[1];
+    effective_recurrent_to_input_scale = recurrent_to_input_weight_scale *
+                                         activation_scale /
+                                         intermediate_scale[2];
+  }
+  effective_input_to_forget_scale =
+      input_to_forget_weight_scale * input_scale / intermediate_scale[4];
+  effective_recurrent_to_forget_scale = recurrent_to_forget_weight_scale *
+                                        activation_scale /
+                                        intermediate_scale[5];
+
+  effective_input_to_cell_scale =
+      input_to_cell_weight_scale * input_scale / intermediate_scale[7];
+  effective_recurrent_to_cell_scale =
+      recurrent_to_cell_weight_scale * activation_scale / intermediate_scale[8];
+
+  effective_input_to_output_scale =
+      input_to_output_weight_scale * input_scale / intermediate_scale[10];
+  effective_recurrent_to_output_scale = recurrent_to_output_weight_scale *
+                                        activation_scale /
+                                        intermediate_scale[11];
+  effective_proj_scale =
+      proj_weight_scale * std::pow(2, -15) / activation_scale;
+
+  if (use_peephole) {
+    if (!use_cifg) {
+      effective_cell_to_input_scale =
+          std::pow(2, -15) * cell_to_input_weight_scale / intermediate_scale[0];
+    }
+    effective_cell_to_forget_scale =
+        std::pow(2, -15) * cell_to_forget_weight_scale / intermediate_scale[3];
+    effective_cell_to_output_scale =
+        std::pow(2, -15) * cell_to_output_weight_scale / intermediate_scale[9];
+  }
+
+  // Quantize the effective scales as (multiplier, shift) pairs.
+  QuantizeMultiplier(effective_input_to_input_scale,
+                     &integer_lstm_param->effective_input_to_input_scale_a,
+                     &integer_lstm_param->effective_input_to_input_scale_b);
+  QuantizeMultiplier(effective_recurrent_to_input_scale,
+                     &integer_lstm_param->effective_recurrent_to_input_scale_a,
+                     &integer_lstm_param->effective_recurrent_to_input_scale_b);
+  QuantizeMultiplier(effective_cell_to_input_scale,
+                     &integer_lstm_param->effective_cell_to_input_scale_a,
+                     &integer_lstm_param->effective_cell_to_input_scale_b);
+  QuantizeMultiplier(effective_input_to_forget_scale,
+                     &integer_lstm_param->effective_input_to_forget_scale_a,
+                     &integer_lstm_param->effective_input_to_forget_scale_b);
+  QuantizeMultiplier(
+      effective_recurrent_to_forget_scale,
+      &integer_lstm_param->effective_recurrent_to_forget_scale_a,
+      &integer_lstm_param->effective_recurrent_to_forget_scale_b);
+  QuantizeMultiplier(effective_cell_to_forget_scale,
+                     &integer_lstm_param->effective_cell_to_forget_scale_a,
+                     &integer_lstm_param->effective_cell_to_forget_scale_b);
+  QuantizeMultiplier(effective_input_to_cell_scale,
+                     &integer_lstm_param->effective_input_to_cell_scale_a,
+                     &integer_lstm_param->effective_input_to_cell_scale_b);
+  QuantizeMultiplier(effective_recurrent_to_cell_scale,
+                     &integer_lstm_param->effective_recurrent_to_cell_scale_a,
+                     &integer_lstm_param->effective_recurrent_to_cell_scale_b);
+  QuantizeMultiplier(effective_input_to_output_scale,
+                     &integer_lstm_param->effective_input_to_output_scale_a,
+                     &integer_lstm_param->effective_input_to_output_scale_b);
+  QuantizeMultiplier(
+      effective_recurrent_to_output_scale,
+      &integer_lstm_param->effective_recurrent_to_output_scale_a,
+      &integer_lstm_param->effective_recurrent_to_output_scale_b);
+  QuantizeMultiplier(effective_cell_to_output_scale,
+                     &integer_lstm_param->effective_cell_to_output_scale_a,
+                     &integer_lstm_param->effective_cell_to_output_scale_b);
+  QuantizeMultiplier(effective_proj_scale,
+                     &integer_lstm_param->effective_proj_scale_a,
+                     &integer_lstm_param->effective_proj_scale_b);
+  QuantizeMultiplier(layer_norm_input_scale,
+                     &integer_lstm_param->layer_norm_input_scale_a,
+                     &integer_lstm_param->layer_norm_input_scale_b);
+  QuantizeMultiplier(layer_norm_forget_scale,
+                     &integer_lstm_param->layer_norm_forget_scale_a,
+                     &integer_lstm_param->layer_norm_forget_scale_b);
+  QuantizeMultiplier(layer_norm_cell_scale,
+                     &integer_lstm_param->layer_norm_cell_scale_a,
+                     &integer_lstm_param->layer_norm_cell_scale_b);
+  QuantizeMultiplier(layer_norm_output_scale,
+                     &integer_lstm_param->layer_norm_output_scale_a,
+                     &integer_lstm_param->layer_norm_output_scale_b);
+
+  {
+    // The intermediates in the flatbuffer hold Wx, Wh and Wx+Wh.
+    // The effective Wx and Wh scales are already folded into
+    // effective_input/recurrent_to_<...>_scale, so use intermediate_scale to
+    // hold the scale from Wx and Wh to Wx+Wh:
+    //   0: [1] -> [0]
+    //   1: [2] -> [0]
+    // and use intermediate_zp as is.
+    const float s_1_0 = intermediate_scale[1] / intermediate_scale[0];
+    const float s_2_0 = intermediate_scale[2] / intermediate_scale[0];
+    const float s_4_3 = intermediate_scale[4] / intermediate_scale[3];
+    const float s_5_3 = intermediate_scale[5] / intermediate_scale[3];
+    const float s_7_6 = intermediate_scale[7] / intermediate_scale[6];
+    const float s_8_6 = intermediate_scale[8] / intermediate_scale[6];
+    const float s_10_9 = intermediate_scale[10] / intermediate_scale[9];
+    const float s_11_9 = intermediate_scale[11] / intermediate_scale[9];
+    QuantizeMultiplier(s_1_0, &integer_lstm_param->intermediate_scale_a[0],
+                       &integer_lstm_param->intermediate_scale_b[0]);
+    QuantizeMultiplier(s_2_0, &integer_lstm_param->intermediate_scale_a[1],
+                       &integer_lstm_param->intermediate_scale_b[1]);
+    QuantizeMultiplier(s_4_3, &integer_lstm_param->intermediate_scale_a[2],
+                       &integer_lstm_param->intermediate_scale_b[2]);
+    QuantizeMultiplier(s_5_3, &integer_lstm_param->intermediate_scale_a[3],
+                       &integer_lstm_param->intermediate_scale_b[3]);
+    QuantizeMultiplier(s_7_6, &integer_lstm_param->intermediate_scale_a[4],
+                       &integer_lstm_param->intermediate_scale_b[4]);
+    QuantizeMultiplier(s_8_6, &integer_lstm_param->intermediate_scale_a[5],
+                       &integer_lstm_param->intermediate_scale_b[5]);
+    QuantizeMultiplier(s_10_9, &integer_lstm_param->intermediate_scale_a[6],
+                       &integer_lstm_param->intermediate_scale_b[6]);
+    QuantizeMultiplier(s_11_9, &integer_lstm_param->intermediate_scale_a[7],
+                       &integer_lstm_param->intermediate_scale_b[7]);
+  }
+
+  // Calculate quantized clip for projection and cell.
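+  // A float clip value c maps to c / state_scale in the quantized domain;
+  // with the cell scale checked against 2^-15 below, a cell_clip of 0.5 would
+  // quantize to 16384.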
+  const auto* params =
+      reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data);
+  const float cell_clip = params->cell_clip;
+  const float proj_clip = params->proj_clip;
+
+  const TfLiteTensor* cell_tensor =
+      GetInput(context, node, kInputCellStateTensor);
+  const TfLiteTensor* output_tensor = GetOutput(context, node, kOutputTensor);
+
+  auto* cell_params = reinterpret_cast<TfLiteAffineQuantization*>(
+      cell_tensor->quantization.params);
+  auto* proj_params = reinterpret_cast<TfLiteAffineQuantization*>(
+      output_tensor->quantization.params);
+  TF_LITE_ENSURE_EQ(context, cell_params->scale->data[0], 1.0 / 32768);
+  if (cell_clip > 0.0 && cell_clip < 1.0) {
+    integer_lstm_param->quantized_cell_clip =
+        static_cast<int16_t>(cell_clip / cell_params->scale->data[0]);
+  } else {
+    integer_lstm_param->quantized_cell_clip = 0;
+  }
+  if (proj_clip > 0.0) {
+    integer_lstm_param->quantized_proj_clip =
+        proj_clip / proj_params->scale->data[0];
+  } else {
+    integer_lstm_param->quantized_proj_clip = 0;
+  }
+  return kTfLiteOk;
+}
+
 }  // namespace
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
@@ -868,11 +1224,25 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   // The weights are of consistent type, so it suffices to check one.
   const bool is_hybrid_op = IsHybridOp(input, input_to_output_weights);
 
+  // The type of Integer LSTM.
+  const int num_intermediate_tensors = node->intermediates->size;
+  if (is_integer) {
+    TF_LITE_ENSURE(context, num_intermediate_tensors == 5 ||
+                                num_intermediate_tensors == 12);
+  }
+  // We use the number of intermediate tensors to distinguish the 8-bit matmul
+  // output version from the 16-bit matmul output version.
+  const bool is_8x8_16 = num_intermediate_tensors == 5;
+
   TfLiteIntArrayFree(node->temporaries);
   if (is_hybrid_op) {
     node->temporaries = TfLiteIntArrayCreate(8);
   } else if (is_integer) {
-    node->temporaries = TfLiteIntArrayCreate(6);
+    if (is_8x8_16) {
+      node->temporaries = TfLiteIntArrayCreate(6);
+    } else {
+      node->temporaries = TfLiteIntArrayCreate(8);
+    }
   } else {
     node->temporaries = TfLiteIntArrayCreate(1);
   }
@@ -1003,42 +1373,78 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   }
 
   if (is_integer) {
-    // Populate quantization parameters.
-    PopulateQuantizedLstmParams(context, node, &op_data->integer_lstm_param);
+    if (is_8x8_16) {
+      // Integer LSTM prepare function for 8x8->16.
+      // This code path needs 5 intermediate tensors per Op.
+      // Populate quantization parameters.
+      PopulateQuantizedLstmParams8x8_16(context, node,
+                                        &op_data->integer_lstm_param);
 
-    // Allocate scratch buffer. Need 6 16bit buffer with size n_batch * n_cell
-    // and 1 8bit buffer with size n_batch * n_cell. We also need 1 32 bit
-    // buffer with size n_batch * n_cell.
-    //
-    // TODO(jianlijianli): Handle cifg case as well, which might save one
-    // buffer.
-    for (int scratch_index = 0; scratch_index < 6; ++scratch_index) {
-      node->temporaries->data[scratch_index] =
-          op_data->scratch_tensor_index + scratch_index;
-      TfLiteTensor* scratch_tensor =
-          GetTemporary(context, node, /*index=*/scratch_index);
-      scratch_tensor->type = kTfLiteInt16;
-      if (scratch_index == 4) {
-        scratch_tensor->type = kTfLiteInt8;
-      } else if (scratch_index == 5) {
-        scratch_tensor->type = kTfLiteInt32;
+      // Allocate scratch buffers: four 16-bit, one 8-bit, and one 32-bit,
+      // each of size n_batch * n_cell (see the type assignments below).
+      //
+      // Handle cifg case as well, which might save one buffer.
+      // Allocate scratch buffers: 4 16-bit, 1 8-bit and 1 32-bit buffer,
+      // each of size n_batch * n_cell.
+      //
+      // The CIFG case, which might save one buffer, is not yet handled
+      // separately here.
+      for (int scratch_index = 0; scratch_index < 6; ++scratch_index) {
+        node->temporaries->data[scratch_index] =
+            op_data->scratch_tensor_index + scratch_index;
+        TfLiteTensor* scratch_tensor =
+            GetTemporary(context, node, /*index=*/scratch_index);
+        scratch_tensor->type = kTfLiteInt16;
+        if (scratch_index == 4) {
+          scratch_tensor->type = kTfLiteInt8;
+        } else if (scratch_index == 5) {
+          scratch_tensor->type = kTfLiteInt32;
+        }
+        scratch_tensor->allocation_type = kTfLiteArenaRw;
+        const int scratch_dimension[2] = {n_batch, n_cell};
+        if (!TfLiteIntArrayEqualsArray(scratch_tensor->dims, 2,
+                                       scratch_dimension)) {
+          TfLiteIntArray* scratch_buffer_size = TfLiteIntArrayCreate(2);
+          scratch_buffer_size->data[0] = n_batch;
+          scratch_buffer_size->data[1] = n_cell;
+          TF_LITE_ENSURE_OK(context,
+                            context->ResizeTensor(context, scratch_tensor,
+                                                  scratch_buffer_size));
+        }
+      }
+
+      // Populate precomputed zp * weight.
+      TF_LITE_ENSURE_OK(context, PopulatePrecomputedZPTimesWeightsWithBias(
+                                     context, op_data, node));
+    } else {
+      // Integer LSTM prepare function for 8x8->8.
+      // This code path needs 12 intermediate tensors per Op.
+      PopulateQuantizedLstmParams8x8_8(context, node,
+                                       &op_data->integer_lstm_param);
+
+      // Allocate scratch buffers: 2 8-bit and 6 16-bit buffers, each of size
+      // n_batch * n_cell.
+      //
+      // The CIFG case, which might save one buffer, is not yet handled
+      // separately here.
+      for (int scratch_index = 0; scratch_index < 8; ++scratch_index) {
+        node->temporaries->data[scratch_index] =
+            op_data->scratch_tensor_index + scratch_index;
+        TfLiteTensor* scratch_tensor =
+            GetTemporary(context, node, /*index=*/scratch_index);
+        if (scratch_index == 0 || scratch_index == 1) {
+          scratch_tensor->type = kTfLiteInt8;
+        } else {
+          scratch_tensor->type = kTfLiteInt16;
+        }
+        scratch_tensor->allocation_type = kTfLiteArenaRw;
+        const int scratch_dimension[2] = {n_batch, n_cell};
+        if (!TfLiteIntArrayEqualsArray(scratch_tensor->dims, 2,
+                                       scratch_dimension)) {
+          TfLiteIntArray* scratch_buffer_size = TfLiteIntArrayCreate(2);
+          scratch_buffer_size->data[0] = n_batch;
+          scratch_buffer_size->data[1] = n_cell;
+          TF_LITE_ENSURE_OK(context,
+                            context->ResizeTensor(context, scratch_tensor,
+                                                  scratch_buffer_size));
+        }
+      }
+    }
-
-    // Populate precomputed zp * weight.
- TF_LITE_ENSURE_OK(context, PopulatePrecomputedZPTimesWeightsWithBias( - context, op_data, node)); } return kTfLiteOk; } @@ -1174,26 +1580,51 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { output_scratch_buffer, output, CpuBackendContext::GetFromContext(context)); } else { - TfLiteTensor* scratch0 = GetTemporary(context, node, /*index=*/0); - TfLiteTensor* scratch1 = GetTemporary(context, node, /*index=*/1); - TfLiteTensor* scratch2 = GetTemporary(context, node, /*index=*/2); - TfLiteTensor* scratch3 = GetTemporary(context, node, /*index=*/3); - TfLiteTensor* scratch4 = GetTemporary(context, node, /*index=*/4); - TfLiteTensor* scratch5 = GetTemporary(context, node, /*index=*/5); - return lstm_eval::EvalInteger8x8_16( - input, input_to_input_weights, input_to_forget_weights, - input_to_cell_weights, input_to_output_weights, - recurrent_to_input_weights, recurrent_to_forget_weights, - recurrent_to_cell_weights, recurrent_to_output_weights, - cell_to_input_weights, cell_to_forget_weights, - cell_to_output_weights, input_layer_norm_coefficients, - forget_layer_norm_coefficients, cell_layer_norm_coefficients, - output_layer_norm_coefficients, input_gate_bias, forget_gate_bias, - cell_bias, output_gate_bias, projection_weights, projection_bias, - params, &op_data->integer_lstm_param, activation_state, cell_state, - output, scratch0, scratch1, scratch2, scratch3, scratch4, scratch5, - CpuBackendContext::GetFromContext(context)); - return kTfLiteOk; + const int num_intermediate_tensors = node->intermediates->size; + if (num_intermediate_tensors == 5) { + TfLiteTensor* scratch0 = GetTemporary(context, node, /*index=*/0); + TfLiteTensor* scratch1 = GetTemporary(context, node, /*index=*/1); + TfLiteTensor* scratch2 = GetTemporary(context, node, /*index=*/2); + TfLiteTensor* scratch3 = GetTemporary(context, node, /*index=*/3); + TfLiteTensor* scratch4 = GetTemporary(context, node, /*index=*/4); + TfLiteTensor* scratch5 = GetTemporary(context, node, /*index=*/5); + return lstm_eval::EvalInteger8x8_16( + input, input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights, + recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, + cell_to_input_weights, cell_to_forget_weights, + cell_to_output_weights, input_layer_norm_coefficients, + forget_layer_norm_coefficients, cell_layer_norm_coefficients, + output_layer_norm_coefficients, input_gate_bias, forget_gate_bias, + cell_bias, output_gate_bias, projection_weights, projection_bias, + params, &op_data->integer_lstm_param, activation_state, + cell_state, output, scratch0, scratch1, scratch2, scratch3, + scratch4, scratch5, CpuBackendContext::GetFromContext(context)); + } else { + TfLiteTensor* scratch0 = GetTemporary(context, node, /*index=*/0); + TfLiteTensor* scratch1 = GetTemporary(context, node, /*index=*/1); + TfLiteTensor* scratch2 = GetTemporary(context, node, /*index=*/2); + TfLiteTensor* scratch3 = GetTemporary(context, node, /*index=*/3); + TfLiteTensor* scratch4 = GetTemporary(context, node, /*index=*/4); + TfLiteTensor* scratch5 = GetTemporary(context, node, /*index=*/5); + TfLiteTensor* scratch6 = GetTemporary(context, node, /*index=*/6); + TfLiteTensor* scratch7 = GetTemporary(context, node, /*index=*/7); + return lstm_eval::EvalInteger8x8_8( + input, input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights, + recurrent_to_input_weights, recurrent_to_forget_weights, + 
recurrent_to_cell_weights, recurrent_to_output_weights,
+            cell_to_input_weights, cell_to_forget_weights,
+            cell_to_output_weights, input_layer_norm_coefficients,
+            forget_layer_norm_coefficients, cell_layer_norm_coefficients,
+            output_layer_norm_coefficients, input_gate_bias, forget_gate_bias,
+            cell_bias, output_gate_bias, projection_weights, projection_bias,
+            params, activation_state, cell_state, output,
+            &op_data->integer_lstm_param, scratch0, scratch1, scratch2,
+            scratch3, scratch4, scratch5, scratch6, scratch7);
+      }
     }
   }
   default:
diff --git a/tensorflow/lite/kernels/lstm_eval.cc b/tensorflow/lite/kernels/lstm_eval.cc
index 3670b25f5d7..5691a7df8a5 100644
--- a/tensorflow/lite/kernels/lstm_eval.cc
+++ b/tensorflow/lite/kernels/lstm_eval.cc
@@ -1,4 +1,4 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -816,7 +816,7 @@ inline void LstmStepHybrid(
   }
 }
 
-// Fully quantized lstm kernel. Currently supports both cifg and non-cifg.
+// Fully quantized lstm kernel for 16 bit gate matmul output.
 //
 // Input activation of size n_batch * n_input:
 //   input_ptr
@@ -895,7 +895,7 @@ inline void LstmStepHybrid(
 //
 // Temporary pre-allocated storage for the calculation. Each is of size n_cell *
 // n_batch.
-// scratch_0:
+// scratch_0
 // scratch_1
 // scratch_2
 // scratch_3
@@ -1142,6 +1142,272 @@ inline void LstmStepInteger(
   std::copy_n(output_ptr, n_batch * n_output, activation_ptr);
 }
 
+// Fully quantized lstm kernel for 8 bit gate matmul output.
+//
+// Input activation of size n_batch * n_input:
+//   input_ptr
+//
+// LSTM weights:
+// Quantized input weights of size 'n_cell * n_input':
+//   input_to_input_weight_ptr - optional
+//   input_to_forget_weight_ptr
+//   input_to_cell_weight_ptr
+//   input_to_output_weight_ptr
+//
+// Quantized recurrent weights of size 'n_cell * n_output':
+//   recurrent_to_input_weight_ptr - optional
+//   recurrent_to_forget_weight_ptr
+//   recurrent_to_cell_weight_ptr
+//   recurrent_to_output_weight_ptr
+//
+// Quantized peephole weights of size 'n_cell', representing diagonal matrices.
+//   cell_to_input_weights - optional
+//   cell_to_forget_weights - optional
+//   cell_to_output_weights - optional
+//
+// Quantized projection weights of size 'n_output * n_cell'
+//   proj_weight_ptr - optional
+//
+// Weight scales (scalars) for each of the weights above.
+//   effective_input_to_input_scale_a - optional
+//   effective_input_to_input_scale_b - optional
+//   effective_input_to_forget_scale_a
+//   effective_input_to_forget_scale_b
+//   effective_input_to_cell_scale_a
+//   effective_input_to_cell_scale_b
+//   effective_input_to_output_scale_a
+//   effective_input_to_output_scale_b
+//   effective_recurrent_to_input_scale_a - optional
+//   effective_recurrent_to_input_scale_b - optional
+//   effective_recurrent_to_forget_scale_a
+//   effective_recurrent_to_forget_scale_b
+//   effective_recurrent_to_cell_scale_a
+//   effective_recurrent_to_cell_scale_b
+//   effective_recurrent_to_output_scale_a
+//   effective_recurrent_to_output_scale_b
+//   effective_proj_scale_a - optional
+//   effective_proj_scale_b - optional
+//
+// Gate biases of size 'n_cell':
+//   input_bias_ptr - optional
+//   forget_bias_ptr
+//   cell_bias_ptr
+//   output_bias_ptr
+//
+// Layer norm coefficients of size 'n_cell', representing diagonal matrices.
+//   layer_norm_input_weight_ptr - optional
+//   layer_norm_forget_weight_ptr
+//   layer_norm_cell_weight_ptr
+//   layer_norm_output_weight_ptr
+//
+// Layer norm scales of size 'n_cell'.
+//   layer_norm_input_scale_a - optional
+//   layer_norm_input_scale_b - optional
+//   layer_norm_forget_scale_a
+//   layer_norm_forget_scale_b
+//   layer_norm_cell_scale_a
+//   layer_norm_cell_scale_b
+//   layer_norm_output_scale_a
+//   layer_norm_output_scale_b
+//
+// Scalar values:
+//   quantized_cell_clip: quantized clip value for cell.
+//   quantized_proj_clip: quantized clip value for projection.
+//   cell_scale: the power of two scale for cell state.
+//
+// Zero points:
+//   activation_zp: zero point of activation.
+//   hidden_zp: zero point for hidden state.
+//
+// Temporary pre-allocated storage for the calculation. Each is of size n_cell *
+// n_batch.
+//   scratch_0
+//   scratch_1
+//   scratch_2
+//   scratch_3
+//   scratch_4
+//   scratch_5
+//   scratch_6
+//   scratch_7
+//
+// Outputs:
+//   output_state_ptr - size 'n_batch * n_output'
+//   cell_state_ptr - size 'n_batch * n_cell'
+//   output_ptr - size 'n_batch * n_output'
+// TODO(b/148688698): Move zero point calculation into Prepare().
+void LstmStepInteger(
+    const int8_t* input_ptr, int32_t input_zp,
+    const int8_t* input_to_input_weight_ptr,
+    int32_t effective_input_to_input_scale_a,
+    int32_t effective_input_to_input_scale_b,
+    const int8_t* input_to_forget_weight_ptr,
+    int32_t effective_input_to_forget_scale_a,
+    int32_t effective_input_to_forget_scale_b,
+    const int8_t* input_to_cell_weight_ptr,
+    int32_t effective_input_to_cell_scale_a,
+    int32_t effective_input_to_cell_scale_b,
+    const int8_t* input_to_output_weight_ptr,
+    int32_t effective_input_to_output_scale_a,
+    int32_t effective_input_to_output_scale_b,
+    const int8_t* recurrent_to_input_weight_ptr,
+    int32_t effective_recurrent_to_input_scale_a,
+    int32_t effective_recurrent_to_input_scale_b,
+    const int8_t* recurrent_to_forget_weight_ptr,
+    int32_t effective_recurrent_to_forget_scale_a,
+    int32_t effective_recurrent_to_forget_scale_b,
+    const int8_t* recurrent_to_cell_weight_ptr,
+    int32_t effective_recurrent_to_cell_scale_a,
+    int32_t effective_recurrent_to_cell_scale_b,
+    const int8_t* recurrent_to_output_weight_ptr,
+    int32_t effective_recurrent_to_output_scale_a,
+    int32_t effective_recurrent_to_output_scale_b,
+    const int8_t* cell_to_input_weight_ptr,
+    int32_t effective_cell_to_input_scale_a,
+    int32_t effective_cell_to_input_scale_b,
+    const int8_t* cell_to_forget_weight_ptr,
+    int32_t effective_cell_to_forget_scale_a,
+    int32_t effective_cell_to_forget_scale_b,
+    const int8_t* cell_to_output_weight_ptr,
+    int32_t effective_cell_to_output_scale_a,
+    int32_t effective_cell_to_output_scale_b, const int8_t* proj_weight_ptr,
+    int32_t effective_proj_scale_a, int32_t effective_proj_scale_b,
+    const int16_t* layer_norm_input_weight_ptr,
+    int32_t layer_norm_input_scale_a, int32_t layer_norm_input_scale_b,
+    const int16_t* layer_norm_forget_weight_ptr,
+    int32_t layer_norm_forget_scale_a, int32_t layer_norm_forget_scale_b,
+    const int16_t* layer_norm_cell_weight_ptr, int32_t layer_norm_cell_scale_a,
+    int32_t layer_norm_cell_scale_b,
+    const int16_t* layer_norm_output_weight_ptr,
+    int32_t layer_norm_output_scale_a, int32_t layer_norm_output_scale_b,
+    const int32_t* input_bias_ptr, const int32_t* forget_bias_ptr,
+    const int32_t* cell_bias_ptr, const int32_t* output_bias_ptr,
+    const int32_t*
proj_bias_ptr, const TfLiteLSTMParams* params, + const int32_t* intermediate_scale_a, const int32_t* intermediate_scale_b, + const int32_t* intermediate_zp, int32 quantized_cell_clip, + int32 quantized_proj_clip, int32 n_batch, int32 n_cell, int32 n_input, + int32 n_output, int32 output_batch_leading_dim, int8_t* activation_ptr, + int32_t activation_zp, int16_t* cell_ptr, int8_t* output_ptr, + int8_t* scratch0, int8_t* scratch1, int16_t* scratch2, int16_t* scratch3, + int16_t* scratch4, int16_t* scratch5, int16_t* scratch6, + int16_t* scratch7) { + // Forget gate. + memset(scratch0, 0, n_batch * n_cell); + memset(scratch1, 0, n_batch * n_cell); + tensor_utils::MatrixBatchVectorMultiply( + input_ptr, input_zp, input_to_forget_weight_ptr, + effective_input_to_forget_scale_a, effective_input_to_forget_scale_b, + n_batch, n_input, n_cell, scratch0, intermediate_zp[4]); + + tensor_utils::MatrixBatchVectorMultiply( + activation_ptr, activation_zp, recurrent_to_forget_weight_ptr, + effective_recurrent_to_forget_scale_a, + effective_recurrent_to_forget_scale_b, n_batch, n_output, n_cell, + scratch1, intermediate_zp[5]); + + tensor_utils::TwoGateSaturationgAdd( + scratch0, intermediate_zp[4], scratch1, intermediate_zp[5], + intermediate_scale_a[2], intermediate_scale_b[2], intermediate_scale_a[3], + intermediate_scale_b[3], n_batch, n_cell, scratch2); + + // Forget gate layer norm. + tensor_utils::ApplyLayerNormFloat( + scratch2, layer_norm_forget_weight_ptr, layer_norm_forget_scale_a, + layer_norm_forget_scale_b, forget_bias_ptr, n_batch, n_cell, scratch2); + + // Forget gate sigmoid. + tensor_utils::ApplySigmoidFloat(scratch2, n_batch, n_cell, scratch2); + + // Update gate. + memset(scratch0, 0, n_batch * n_cell); + memset(scratch1, 0, n_batch * n_cell); + tensor_utils::MatrixBatchVectorMultiply( + input_ptr, input_zp, input_to_cell_weight_ptr, + effective_input_to_cell_scale_a, effective_input_to_cell_scale_b, n_batch, + n_input, n_cell, scratch0, intermediate_zp[7]); + + tensor_utils::MatrixBatchVectorMultiply( + activation_ptr, activation_zp, recurrent_to_cell_weight_ptr, + effective_recurrent_to_cell_scale_a, effective_recurrent_to_cell_scale_b, + n_batch, n_output, n_cell, scratch1, intermediate_zp[8]); + + tensor_utils::TwoGateSaturationgAdd( + scratch0, intermediate_zp[7], scratch1, intermediate_zp[8], + intermediate_scale_a[4], intermediate_scale_b[4], intermediate_scale_a[5], + intermediate_scale_b[5], n_batch, n_cell, scratch3); + + // Update gate with layer norm. + tensor_utils::ApplyLayerNormFloat( + scratch3, layer_norm_cell_weight_ptr, layer_norm_cell_scale_a, + layer_norm_cell_scale_b, cell_bias_ptr, n_batch, n_cell, scratch3); + + // Update gate tanh. + tensor_utils::ApplyTanhFloat(scratch3, n_batch, n_cell, -12, scratch3); + + // Output gate. 
+  memset(scratch0, 0, n_batch * n_cell);
+  memset(scratch1, 0, n_batch * n_cell);
+  tensor_utils::MatrixBatchVectorMultiply(
+      input_ptr, input_zp, input_to_output_weight_ptr,
+      effective_input_to_output_scale_a, effective_input_to_output_scale_b,
+      n_batch, n_input, n_cell, scratch0, intermediate_zp[10]);
+
+  tensor_utils::MatrixBatchVectorMultiply(
+      activation_ptr, activation_zp, recurrent_to_output_weight_ptr,
+      effective_recurrent_to_output_scale_a,
+      effective_recurrent_to_output_scale_b, n_batch, n_output, n_cell,
+      scratch1, intermediate_zp[11]);
+
+  tensor_utils::TwoGateSaturationgAdd(
+      scratch0, intermediate_zp[10], scratch1, intermediate_zp[11],
+      intermediate_scale_a[6], intermediate_scale_b[6], intermediate_scale_a[7],
+      intermediate_scale_b[7], n_batch, n_cell, scratch4);
+
+  // Output gate with layer norm.
+  tensor_utils::ApplyLayerNormFloat(
+      scratch4, layer_norm_output_weight_ptr, layer_norm_output_scale_a,
+      layer_norm_output_scale_b, output_bias_ptr, n_batch, n_cell, scratch4);
+
+  // Output gate sigmoid.
+  tensor_utils::ApplySigmoidFloat(scratch4, n_batch, n_cell, scratch4);
+
+  // Input gate, coupled with the forget gate (CIFG): input = 1 - forget.
+  tensor_utils::Sub1Vector(scratch2, n_batch * n_cell, scratch5);
+
+  // New cell.
+  tensor_utils::CwiseMul(scratch2, cell_ptr, n_batch, n_cell, 15 + 15 - 15,
+                         scratch6);
+
+  tensor_utils::CwiseMul(scratch5, scratch3, n_batch, n_cell, 15 + 15 - 15,
+                         scratch7);
+
+  tensor_utils::CwiseAdd(scratch6, scratch7, n_batch, n_cell, cell_ptr);
+
+  if (quantized_cell_clip > 0) {
+    tensor_utils::CwiseClipping(cell_ptr, quantized_cell_clip, n_batch, n_cell);
+  }
+
+  // Cell to hidden.
+  tensor_utils::ApplyTanhFloat(cell_ptr, n_batch, n_cell, -15, scratch2);
+
+  tensor_utils::CwiseMul(scratch4, scratch2, n_batch, n_cell, 15 + 15 - 15,
+                         scratch3);
+
+  // Projection.
+  tensor_utils::MatrixBatchVectorMultiply(
+      scratch3, proj_weight_ptr, effective_proj_scale_a, effective_proj_scale_b,
+      proj_bias_ptr, n_batch, n_cell, n_output, activation_zp, output_ptr);
+
+  // Projection clipping.
+  if (quantized_proj_clip > 0) {
+    tensor_utils::CwiseClipping(output_ptr, quantized_proj_clip, n_batch,
+                                n_output);
+  }
+
+  // Copy output to activation.
+ memcpy(activation_ptr, output_ptr, n_batch * n_output * sizeof(int8_t)); +} + } // namespace // LINT.IfChange @@ -1692,6 +1958,186 @@ TfLiteStatus EvalInteger8x8_16( return kTfLiteOk; } +TfLiteStatus EvalInteger8x8_8( + const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights, + const TfLiteTensor* input_to_forget_weights, + const TfLiteTensor* input_to_cell_weights, + const TfLiteTensor* input_to_output_weights, + const TfLiteTensor* recurrent_to_input_weights, + const TfLiteTensor* recurrent_to_forget_weights, + const TfLiteTensor* recurrent_to_cell_weights, + const TfLiteTensor* recurrent_to_output_weights, + const TfLiteTensor* cell_to_input_weights, + const TfLiteTensor* cell_to_forget_weights, + const TfLiteTensor* cell_to_output_weights, + const TfLiteTensor* input_layer_norm_coefficients, + const TfLiteTensor* forget_layer_norm_coefficients, + const TfLiteTensor* cell_layer_norm_coefficients, + const TfLiteTensor* output_layer_norm_coefficients, + const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias, + const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias, + const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias, + const TfLiteLSTMParams* params, TfLiteTensor* activation_state, + TfLiteTensor* cell_state, TfLiteTensor* output, + const lstm_eval::IntegerLstmParameter* integer_lstm_param, + TfLiteTensor* scratch0, TfLiteTensor* scratch1, TfLiteTensor* scratch2, + TfLiteTensor* scratch3, TfLiteTensor* scratch4, TfLiteTensor* scratch5, + TfLiteTensor* scratch6, TfLiteTensor* scratch7) { + TF_LITE_ASSERT(input->dims->size >= 2 && input->dims->size <= 3); + const int n_input = input->dims->data[input->dims->size - 1]; + int max_time, n_batch; + if (input->dims->size == 2) { + max_time = 1; + n_batch = input->dims->data[0]; + } else { + max_time = input->dims->data[0]; + n_batch = input->dims->data[1]; + } + + // n_cell and n_output will be the same size when there is no projection. + const int n_cell = input_to_output_weights->dims->data[0]; + const int n_output = recurrent_to_output_weights->dims->data[1]; + + // Weights and states. 
+ const int8_t* input_to_input_weight_ptr = + GetTensorData(input_to_input_weights); + const int8_t* recurrent_to_input_weight_ptr = + GetTensorData(recurrent_to_input_weights); + const int8_t* cell_to_input_weight_ptr = + GetTensorData(cell_to_input_weights); + const int8_t* input_to_forget_weight_ptr = + GetTensorData(input_to_forget_weights); + const int8_t* recurrent_to_forget_weight_ptr = + GetTensorData(recurrent_to_forget_weights); + const int8_t* cell_to_forget_weight_ptr = + GetTensorData(cell_to_forget_weights); + const int8_t* input_to_cell_weight_ptr = + GetTensorData(input_to_cell_weights); + const int8_t* recurrent_to_cell_weight_ptr = + GetTensorData(recurrent_to_cell_weights); + const int8_t* input_to_output_weight_ptr = + GetTensorData(input_to_output_weights); + const int8_t* recurrent_to_output_weight_ptr = + GetTensorData(recurrent_to_output_weights); + const int8_t* cell_to_output_weight_ptr = + GetTensorData(cell_to_output_weights); + const int8_t* proj_weight_ptr = GetTensorData(projection_weights); + const int16_t* layer_norm_input_weight_ptr = + GetTensorData(input_layer_norm_coefficients); + const int16_t* layer_norm_forget_weight_ptr = + GetTensorData(forget_layer_norm_coefficients); + const int16_t* layer_norm_cell_weight_ptr = + GetTensorData(cell_layer_norm_coefficients); + const int16_t* layer_norm_output_weight_ptr = + GetTensorData(output_layer_norm_coefficients); + const int32_t* input_bias_ptr = GetTensorData(input_gate_bias); + const int32_t* forget_bias_ptr = GetTensorData(forget_gate_bias); + const int32_t* cell_bias_ptr = GetTensorData(cell_bias); + const int32_t* output_bias_ptr = GetTensorData(output_gate_bias); + const int32_t* proj_bias_ptr = GetTensorData(projection_bias); + int16_t* cell_ptr = GetTensorData(cell_state); + int8_t* activation_ptr = GetTensorData(activation_state); + int8_t* output_ptr = nullptr; + + const int32 input_zp = input->params.zero_point; + const int32 activation_zp = activation_state->params.zero_point; + + // Get params for time/batch/sequence. + const int output_batch_leading_dim = + output->dims->data[output->dims->size - 1]; + const int input_step = n_batch * n_input; + const int output_step = n_batch * output_batch_leading_dim; + + for (int t = 0; t < max_time; t++) { + const int t_rel = t; + output_ptr = output->data.int8 + t_rel * output_step; + + // Input can be int8 asymmetric or int16 symmetric. 
+ const int8_t* input_ptr = input->data.int8 + t_rel * input_step; + lstm_eval::LstmStepInteger( + input_ptr, input_zp, + + input_to_input_weight_ptr, + integer_lstm_param->effective_input_to_input_scale_a, + integer_lstm_param->effective_input_to_input_scale_b, + + input_to_forget_weight_ptr, + integer_lstm_param->effective_input_to_forget_scale_a, + integer_lstm_param->effective_input_to_forget_scale_b, + + input_to_cell_weight_ptr, + integer_lstm_param->effective_input_to_cell_scale_a, + integer_lstm_param->effective_input_to_cell_scale_b, + + input_to_output_weight_ptr, + integer_lstm_param->effective_input_to_output_scale_a, + integer_lstm_param->effective_input_to_output_scale_b, + + recurrent_to_input_weight_ptr, + integer_lstm_param->effective_recurrent_to_input_scale_a, + integer_lstm_param->effective_recurrent_to_input_scale_b, + + recurrent_to_forget_weight_ptr, + integer_lstm_param->effective_recurrent_to_forget_scale_a, + integer_lstm_param->effective_recurrent_to_forget_scale_b, + + recurrent_to_cell_weight_ptr, + integer_lstm_param->effective_recurrent_to_cell_scale_a, + integer_lstm_param->effective_recurrent_to_cell_scale_b, + + recurrent_to_output_weight_ptr, + integer_lstm_param->effective_recurrent_to_output_scale_a, + integer_lstm_param->effective_recurrent_to_output_scale_b, + + cell_to_input_weight_ptr, + integer_lstm_param->effective_cell_to_input_scale_a, + integer_lstm_param->effective_cell_to_input_scale_b, + + cell_to_forget_weight_ptr, + integer_lstm_param->effective_cell_to_forget_scale_a, + integer_lstm_param->effective_cell_to_forget_scale_b, + + cell_to_output_weight_ptr, + integer_lstm_param->effective_cell_to_output_scale_a, + integer_lstm_param->effective_cell_to_output_scale_b, + + proj_weight_ptr, integer_lstm_param->effective_proj_scale_a, + integer_lstm_param->effective_proj_scale_b, + + layer_norm_input_weight_ptr, + integer_lstm_param->layer_norm_input_scale_a, + integer_lstm_param->layer_norm_input_scale_b, + + layer_norm_forget_weight_ptr, + integer_lstm_param->layer_norm_forget_scale_a, + integer_lstm_param->layer_norm_forget_scale_b, + + layer_norm_cell_weight_ptr, integer_lstm_param->layer_norm_cell_scale_a, + integer_lstm_param->layer_norm_cell_scale_b, + + layer_norm_output_weight_ptr, + integer_lstm_param->layer_norm_output_scale_a, + integer_lstm_param->layer_norm_output_scale_b, + + input_bias_ptr, forget_bias_ptr, cell_bias_ptr, output_bias_ptr, + proj_bias_ptr, + + params, integer_lstm_param->intermediate_scale_a, + integer_lstm_param->intermediate_scale_b, + integer_lstm_param->intermediate_zp, + integer_lstm_param->quantized_cell_clip, + integer_lstm_param->quantized_proj_clip, n_batch, n_cell, n_input, + n_output, output_batch_leading_dim, activation_ptr, activation_zp, + cell_ptr, output_ptr, GetTensorData(scratch0), + GetTensorData(scratch1), GetTensorData(scratch2), + GetTensorData(scratch3), GetTensorData(scratch4), + GetTensorData(scratch5), GetTensorData(scratch6), + GetTensorData(scratch7)); + } + + return kTfLiteOk; +} + } // namespace lstm_eval } // namespace builtin } // namespace ops diff --git a/tensorflow/lite/kernels/lstm_eval.h b/tensorflow/lite/kernels/lstm_eval.h index c61d396bb33..ca3f96391aa 100644 --- a/tensorflow/lite/kernels/lstm_eval.h +++ b/tensorflow/lite/kernels/lstm_eval.h @@ -1,4 +1,4 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -28,7 +28,8 @@ namespace ops {
 namespace builtin {
 namespace lstm_eval {
 
-// Pamameters for quantized lstm.
+// Parameters for integer LSTM.
+// Consider splitting this into two parameter sets if more fields are added.
 struct IntegerLstmParameter {
   int32_t effective_input_to_input_scale_a;
   int32_t effective_input_to_input_scale_b;
@@ -75,24 +76,24 @@ struct IntegerLstmParameter {
   int32_t cell_variance_guard;
   int32_t output_variance_guard;
 
-  // The fields are used for pre-computing zero_point * weight.
-  // We cannot use temporary tensors since temporary tensors are not alllocated
-  // yet until end of prepare.
-
-  // Forget gate.
+  // Pre-calculate bias + zero_point * weight.
+  // Unable to use temporary tensors for this, since those are needed in
+  // Prepare() and the scratch buffers are only allocated after Prepare().
   std::unique_ptr<int32_t[]> input_to_forget_effective_bias;
   std::unique_ptr<int32_t[]> recurrent_to_forget_effective_bias;
-  // Modulation gate.
   std::unique_ptr<int32_t[]> input_to_cell_effective_bias;
   std::unique_ptr<int32_t[]> recurrent_to_cell_effective_bias;
-  // Output gate.
   std::unique_ptr<int32_t[]> input_to_output_effective_bias;
   std::unique_ptr<int32_t[]> recurrent_to_output_effective_bias;
-  // Input gate.
   std::unique_ptr<int32_t[]> input_to_input_effective_bias;
   std::unique_ptr<int32_t[]> recurrent_to_input_effective_bias;
-  // Projection.
   std::unique_ptr<int32_t[]> projection_effective_bias;
+
+  // Scale and zero point for intermediate tensors.
+  // Used only in the 8x8_8 case.
+  int32_t intermediate_scale_a[8];
+  int32_t intermediate_scale_b[8];
+  int32_t intermediate_zp[12];
 };
 
 TfLiteStatus EvalFloat(
@@ -183,6 +184,32 @@ TfLiteStatus EvalInteger8x8_16(
     TfLiteTensor* scratch2, TfLiteTensor* scratch3, TfLiteTensor* scratch4,
     TfLiteTensor* scratch5, CpuBackendContext* context);
 
+TfLiteStatus EvalInteger8x8_8(
+    const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights,
+    const TfLiteTensor* input_to_forget_weights,
+    const TfLiteTensor* input_to_cell_weights,
+    const TfLiteTensor* input_to_output_weights,
+    const TfLiteTensor* recurrent_to_input_weights,
+    const TfLiteTensor* recurrent_to_forget_weights,
+    const TfLiteTensor* recurrent_to_cell_weights,
+    const TfLiteTensor* recurrent_to_output_weights,
+    const TfLiteTensor* cell_to_input_weights,
+    const TfLiteTensor* cell_to_forget_weights,
+    const TfLiteTensor* cell_to_output_weights,
+    const TfLiteTensor* input_layer_norm_coefficients,
+    const TfLiteTensor* forget_layer_norm_coefficients,
+    const TfLiteTensor* cell_layer_norm_coefficients,
+    const TfLiteTensor* output_layer_norm_coefficients,
+    const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
+    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
+    const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
+    const TfLiteLSTMParams* params, TfLiteTensor* activation_state,
+    TfLiteTensor* cell_state, TfLiteTensor* output,
+    const lstm_eval::IntegerLstmParameter* integer_lstm_param,
+    TfLiteTensor* scratch0, TfLiteTensor* scratch1, TfLiteTensor* scratch2,
+    TfLiteTensor* scratch3, TfLiteTensor* scratch4, TfLiteTensor* scratch5,
+    TfLiteTensor* scratch6, TfLiteTensor* scratch7);
+
 }  // namespace lstm_eval
 }  // namespace builtin
 }  // namespace ops
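The "bias + zero_point * weight" precompute that the struct comment above refers to falls out of expanding the asymmetric matmul: for an int8 input q with zero point zp, sum_j w[j] * (q[j] - zp) = sum_j w[j] * q[j] - zp * sum_j w[j], and the second term does not depend on the input, so it can be folded into the bias once. A sketch of the per-row fold (helper name and sign convention are illustrative, not the library's API):

#include <cstdint>

// Fold the input-independent term -zp * row_sum into the bias, once per row.
int32_t EffectiveBiasForRow(const int8_t* w_row, int n, int32_t zp,
                            int32_t bias) {
  int32_t row_sum = 0;
  for (int j = 0; j < n; ++j) row_sum += w_row[j];
  return bias - zp * row_sum;
}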
diff --git a/tensorflow/lite/kernels/lstm_test.cc b/tensorflow/lite/kernels/lstm_test.cc
index 6c00fc8dc68..f426ffae0e0 100644
--- a/tensorflow/lite/kernels/lstm_test.cc
+++ b/tensorflow/lite/kernels/lstm_test.cc
@@ -2756,6 +2756,483 @@ TEST(LSTMIntegerOpModel, NoCifgYesLayerNormNoYesProjectionYesPeephole) {
   }
 }
 
+class LSTMIntegerOpModel8x8_8 : public SingleOpModel {
+ public:
+  LSTMIntegerOpModel8x8_8(
+      int n_batch, int n_input, int n_cell, int n_output, bool use_cifg,
+      bool use_peephole, bool use_projection_weights, bool use_projection_bias,
+      bool use_layer_norm, float cell_clip, float proj_clip,
+      const std::vector<std::vector<int>>& input_shapes,
+      const std::vector<std::pair<float, float>>& ranges,
+      const std::vector<std::pair<float, int>>& intermediates)
+      : n_batch_(n_batch),
+        n_input_(n_input),
+        n_cell_(n_cell),
+        n_output_(n_output) {
+    EXPECT_EQ(input_shapes.size() + 1, ranges.size());
+    EXPECT_EQ(intermediates.size(), 12);
+    input_ = AddInput(
+        {TensorType_INT8, input_shapes[0], ranges[0].first, ranges[0].second});
+
+    if (use_cifg) {
+      input_to_input_weights_ = AddNullInput();
+    } else {
+      input_to_input_weights_ = AddInput({TensorType_INT8, input_shapes[1],
+                                          ranges[1].first, ranges[1].second});
+    }
+    input_to_forget_weights_ = AddInput(
+        {TensorType_INT8, input_shapes[2], ranges[2].first, ranges[2].second});
+    input_to_cell_weights_ = AddInput(
+        {TensorType_INT8, input_shapes[3], ranges[3].first, ranges[3].second});
+    input_to_output_weights_ = AddInput(
+        {TensorType_INT8, input_shapes[4], ranges[4].first, ranges[4].second});
+
+    if (use_cifg) {
+      recurrent_to_input_weights_ = AddNullInput();
+    } else {
+      recurrent_to_input_weights_ =
+          AddInput({TensorType_INT8, input_shapes[5], ranges[5].first,
+                    ranges[5].second});
+    }
+    recurrent_to_forget_weights_ = AddInput(
+        {TensorType_INT8, input_shapes[6], ranges[6].first, ranges[6].second});
+    recurrent_to_cell_weights_ = AddInput(
+        {TensorType_INT8, input_shapes[7], ranges[7].first, ranges[7].second});
+    recurrent_to_output_weights_ = AddInput(
+        {TensorType_INT8, input_shapes[8], ranges[8].first, ranges[8].second});
+
+    if (use_peephole) {
+      if (use_cifg) {
+        cell_to_input_weights_ = AddNullInput();
+      } else {
+        cell_to_input_weights_ = AddInput({TensorType_INT16, input_shapes[9],
+                                           ranges[9].first, ranges[9].second});
+      }
+      cell_to_forget_weights_ = AddInput({TensorType_INT16, input_shapes[10],
+                                          ranges[10].first, ranges[10].second});
+      cell_to_output_weights_ = AddInput({TensorType_INT16, input_shapes[11],
+                                          ranges[11].first, ranges[11].second});
+    } else {
+      cell_to_input_weights_ = AddNullInput();
+      cell_to_forget_weights_ = AddNullInput();
+      cell_to_output_weights_ = AddNullInput();
+    }
+
+    if (use_cifg) {
+      input_gate_bias_ = AddNullInput();
+    } else {
+      input_gate_bias_ = AddInput({TensorType_INT32, input_shapes[12],
+                                   ranges[12].first, ranges[12].second});
+    }
+    forget_gate_bias_ = AddInput({TensorType_INT32, input_shapes[13],
+                                  ranges[13].first, ranges[13].second});
+    cell_bias_ = AddInput({TensorType_INT32, input_shapes[14], ranges[14].first,
+                           ranges[14].second});
+    output_gate_bias_ = AddInput({TensorType_INT32, input_shapes[15],
+                                  ranges[15].first, ranges[15].second});
+
+    if (use_projection_weights) {
+      projection_weights_ = AddInput({TensorType_INT8, input_shapes[16],
+                                      ranges[16].first, ranges[16].second});
+      if (use_projection_bias) {
+        projection_bias_ = AddInput({TensorType_INT32, input_shapes[17],
+                                     ranges[17].first, ranges[17].second});
+      } else {
+        projection_bias_ = AddNullInput();
+      }
+    } else {
+      projection_weights_ = AddNullInput();
+      projection_bias_ = AddNullInput();
+    }
+
+    // Adding the 2 input state tensors.
+ input_activation_state_ = AddInput({TensorType_INT16, input_shapes[18], + ranges[18].first, ranges[18].second}, + true); + input_cell_state_ = AddInput({TensorType_INT16, input_shapes[19], + ranges[19].first, ranges[19].second}, + true); + + // Layer norm weights. + if (use_layer_norm) { + if (use_cifg) { + input_layer_norm_coefficients_ = AddNullInput(); + } else { + input_layer_norm_coefficients_ = + AddInput({TensorType_INT16, input_shapes[20], ranges[20].first, + ranges[20].second}); + } + forget_layer_norm_coefficients_ = + AddInput({TensorType_INT16, input_shapes[21], ranges[21].first, + ranges[21].second}); + cell_layer_norm_coefficients_ = + AddInput({TensorType_INT16, input_shapes[22], ranges[22].first, + ranges[22].second}); + output_layer_norm_coefficients_ = + AddInput({TensorType_INT16, input_shapes[23], ranges[23].first, + ranges[23].second}); + } + + for (int i = 0; i < intermediates.size(); ++i) { + intermediates_[i] = + AddIntermediate(TensorType_INT16, {intermediates[i].first}, + {intermediates[i].second}); + } + + output_ = AddOutput({TensorType_INT8, + {n_batch, n_output}, + ranges[24].first, + ranges[24].second}); + + SetBuiltinOp(BuiltinOperator_LSTM, BuiltinOptions_LSTMOptions, + CreateLSTMOptions(builder_, ActivationFunctionType_TANH, + cell_clip, proj_clip) + .Union()); + + // Do not apply delegate yet since tensor values are not known (and more + // specifically scales in quantized tensors are not known). + BuildInterpreter(input_shapes, /*allow_fp32_relax_to_fp16=*/false, + /*apply_delegate=*/false); + } + + void SetInputToInputWeights(const std::vector& f) { + QuantizeAndPopulate(input_to_input_weights_, f); + } + + void SetInputToForgetWeights(const std::vector& f) { + QuantizeAndPopulate(input_to_forget_weights_, f); + } + + void SetInputToCellWeights(const std::vector& f) { + QuantizeAndPopulate(input_to_cell_weights_, f); + } + + void SetInputToOutputWeights(const std::vector& f) { + QuantizeAndPopulate(input_to_output_weights_, f); + } + + void SetRecurrentToInputWeights(const std::vector& f) { + QuantizeAndPopulate(recurrent_to_input_weights_, f); + } + + void SetRecurrentToForgetWeights(const std::vector& f) { + QuantizeAndPopulate(recurrent_to_forget_weights_, f); + } + + void SetRecurrentToCellWeights(const std::vector& f) { + QuantizeAndPopulate(recurrent_to_cell_weights_, f); + } + + void SetRecurrentToOutputWeights(const std::vector& f) { + QuantizeAndPopulate(recurrent_to_output_weights_, f); + } + + void SetCellToInputWeights(const std::vector& f) { + QuantizeAndPopulate(cell_to_input_weights_, f); + } + + void SetCellToForgetWeights(const std::vector& f) { + QuantizeAndPopulate(cell_to_forget_weights_, f); + } + + void SetCellToOutputWeights(const std::vector& f) { + QuantizeAndPopulate(cell_to_output_weights_, f); + } + + void SetInputLayerNormCoefficients(const std::vector& f) { + QuantizeAndPopulate(input_layer_norm_coefficients_, f); + } + + void SetForgetLayerNormCoefficients(const std::vector& f) { + QuantizeAndPopulate(forget_layer_norm_coefficients_, f); + } + + void SetCellLayerNormCoefficients(const std::vector& f) { + QuantizeAndPopulate(cell_layer_norm_coefficients_, f); + } + + void SetOutputLayerNormCoefficients(const std::vector& f) { + QuantizeAndPopulate(output_layer_norm_coefficients_, f); + } + + void SetInputGateBias(const std::vector& f) { + QuantizeAndPopulate(input_gate_bias_, f); + } + + void SetForgetGateBias(const std::vector& f) { + QuantizeAndPopulate(forget_gate_bias_, f); + } + + void SetCellBias(const std::vector& 
f) { + QuantizeAndPopulate(cell_bias_, f); + } + + void SetOutputGateBias(const std::vector& f) { + QuantizeAndPopulate(output_gate_bias_, f); + } + + void SetProjectionWeights(const std::vector& f) { + QuantizeAndPopulate(projection_weights_, f); + } + + void SetProjectionBias(const std::vector& f) { + QuantizeAndPopulate(projection_bias_, f); + } + + void SetInput(const std::vector& f) { + QuantizeAndPopulate(input_, f); + } + + std::vector GetOutput() { return ExtractVector(output_); } + + int num_inputs() { return n_input_; } + int num_outputs() { return n_output_; } + int num_cells() { return n_cell_; } + int num_batches() { return n_batch_; } + + protected: + int input_; + int input_to_input_weights_; + int input_to_forget_weights_; + int input_to_cell_weights_; + int input_to_output_weights_; + + int recurrent_to_input_weights_; + int recurrent_to_forget_weights_; + int recurrent_to_cell_weights_; + int recurrent_to_output_weights_; + + int cell_to_input_weights_; + int cell_to_forget_weights_; + int cell_to_output_weights_; + + int input_layer_norm_coefficients_; + int forget_layer_norm_coefficients_; + int cell_layer_norm_coefficients_; + int output_layer_norm_coefficients_; + + int input_gate_bias_; + int forget_gate_bias_; + int cell_bias_; + int output_gate_bias_; + + int projection_weights_; + int projection_bias_; + int input_activation_state_; + int input_cell_state_; + + int intermediates_[12]; + + int output_; + int output_state_; + int cell_state_; + + int n_batch_; + int n_input_; + int n_cell_; + int n_output_; +}; + +TEST(LSTMIntegerOpModel8x8_8, CifgYesLayerNormNoYesProjectionNoPeephole) { + // Hyper parameters. + const int n_batch = 2; + const int n_input = 5; + const int n_cell = 4; + const int n_output = 3; + const float cell_clip = 0.0; + const float proj_clip = 0.0; + + // Model related weights. 
+ const std::vector input_to_input_weights = { + 0.5, 0.6, 0.7, -0.8, -0.9, 0.1, 0.2, 0.3, -0.4, 0.5, + -0.8, 0.7, -0.6, 0.5, -0.4, -0.5, -0.4, -0.3, -0.2, -0.1}; + + const std::vector input_to_forget_weights = { + -0.6, -0.1, 0.3, 0.2, 0.9, -0.5, -0.2, -0.4, 0.3, -0.8, + -0.4, 0.3, -0.5, -0.4, -0.6, 0.3, -0.4, -0.6, -0.5, -0.5}; + + const std::vector input_to_cell_weights = { + -0.4, -0.3, -0.2, -0.1, -0.5, 0.5, -0.2, -0.3, -0.2, -0.6, + 0.6, -0.1, -0.4, -0.3, -0.7, 0.7, -0.9, -0.5, 0.8, 0.6}; + + const std::vector input_to_output_weights = { + -0.8, -0.4, -0.2, -0.9, -0.1, -0.7, 0.3, -0.3, -0.8, -0.2, + 0.6, -0.2, 0.4, -0.7, -0.3, -0.5, 0.1, 0.5, -0.6, -0.4}; + + const std::vector input_gate_bias = {0.03, 0.15, 0.22, 0.38}; + + const std::vector forget_gate_bias = {0.1, -0.3, -0.2, 0.1}; + + const std::vector cell_gate_bias = {-0.05, 0.72, 0.25, 0.08}; + + const std::vector output_gate_bias = {0.05, -0.01, 0.2, 0.1}; + + const std::vector recurrent_to_input_weights = { + -0.2, -0.3, 0.4, 0.1, -0.5, 0.9, -0.2, -0.3, -0.7, 0.05, -0.2, -0.6}; + + const std::vector recurrent_to_cell_weights = { + -0.3, 0.2, 0.1, -0.3, 0.8, -0.08, -0.2, 0.3, 0.8, -0.6, -0.1, 0.2}; + + const std::vector recurrent_to_forget_weights = { + -0.5, -0.3, -0.5, -0.2, 0.6, 0.4, 0.9, 0.3, -0.1, 0.2, 0.5, 0.2}; + + const std::vector recurrent_to_output_weights = { + 0.3, -0.1, 0.1, -0.2, -0.5, -0.7, -0.2, -0.6, -0.1, -0.4, -0.7, -0.2}; + + const std::vector input_layer_norm_coefficients = {0.1, 0.2, 0.3, 0.5}; + const std::vector forget_layer_norm_coefficients = {0.2, 0.2, 0.4, + 0.3}; + const std::vector cell_layer_norm_coefficients = {0.7, 0.2, 0.3, 0.8}; + const std::vector output_layer_norm_coefficients = {0.6, 0.2, 0.2, + 0.5}; + + const std::vector projection_weights = { + -0.1, 0.2, 0.01, -0.2, 0.1, 0.5, 0.3, 0.08, 0.07, 0.2, -0.4, 0.2}; + const std::vector projection_bias = {0.1, 0.3, 0.5}; + + // Input shapes. + const std::vector> inputs = { + {n_batch, n_input}, // input tensor + + {0}, // input_to_input_weight tensor + {n_cell, n_input}, // input_to_forget_weight tensor + {n_cell, n_input}, // input_to_cell_weight tensor + {n_cell, n_input}, // input_to_output_weight tensor + + {0}, // recurrent_to_input_weight tensor + {n_cell, n_output}, // recurrent_to_forget_weight tensor + {n_cell, n_output}, // recurrent_to_cell_weight tensor + {n_cell, n_output}, // recurrent_to_output_weight tensor + + {0}, // cell_to_input_weight tensor + {0}, // cell_to_forget_weight tensor + {0}, // cell_to_output_weight tensor + + {0}, // input_gate_bias tensor + {n_cell}, // forget_gate_bias tensor + {n_cell}, // cell_bias tensor + {n_cell}, // output_gate_bias tensor + + {n_output, n_cell}, // projection_weight tensor + {n_output}, // projection_bias tensor + + {n_batch, n_output}, // activation_state tensor + {n_batch, n_cell}, // cell_state tensor + + {0}, // input_layer_norm_coefficient tensor + {n_cell}, // forget_layer_norm_coefficient tensor + {n_cell}, // cell_layer_norm_coefficient tensor + {n_cell}, // output_layer_norm_coefficient tensor + }; + + // Input ranges. 
+  const std::vector<std::pair<float, float>> ranges = {
+      {-1.0, 127.0 / 128},  // input tensor
+      {-1.0, 1.0},          // input_to_input_weight tensor
+      {-1.0, 1.0},          // input_to_forget_weight tensor
+      {-1.0, 1.0},          // input_to_cell_weight tensor
+      {-1.0, 1.0},          // input_to_output_weight tensor
+
+      {-1.0, 1.0},  // recurrent_to_input_weight tensor
+      {-1.0, 1.0},  // recurrent_to_forget_weight tensor
+      {-1.0, 1.0},  // recurrent_to_cell_weight tensor
+      {-1.0, 1.0},  // recurrent_to_output_weight tensor
+
+      {-1, 1},  // cell_to_input_weight tensor
+      {-1, 1},  // cell_to_forget_weight tensor
+      {-1, 1},  // cell_to_output_weight tensor
+
+      {-100, 100},  // input_gate_bias tensor
+      {-100, 100},  // forget_gate_bias tensor
+      {-100, 100},  // cell_bias tensor
+      {-100, 100},  // output_gate_bias tensor
+
+      {-0.5, 0.5},  // projection_weight tensor
+      {-1, 1},      // projection_bias tensor
+
+      {-1.0, 32767.0 / 32768},  // activation_state tensor
+      {-1.0, 32767.0 / 32768},  // cell_state tensor
+
+      {-1.00001, 1.0},  // input_layer_norm_coefficient tensor
+      {-1.00001, 1.0},  // forget_layer_norm_coefficient tensor
+      {-1.00001, 1.0},  // cell_layer_norm_coefficient tensor
+      {-1.00001, 1.0},  // output_layer_norm_coefficient tensor
+      // Output scale is the same as input activation scale and only activation
+      // scale is used in the op, so this is only provided for clarity.
+      {-1.0, 32767.0 / 32768},  // output tensor.
+  };
+
+  // The scale and zero point of intermediate tensors.
+  std::vector<std::pair<float, int>> intermediates = {
+      {0.007059, 0}, {0.007812, 0}, {0.007059, 0}, {0.007812, 0},
+      {0.007, 0},    {0.007059, 0}, {0.007, 0},    {0.007, 0},
+      {0.007059, 0}, {0.007, 0},    {0.007, 0},    {0.3, 0}};
+
+  // Create model.
+  LSTMIntegerOpModel8x8_8 lstm(n_batch, n_input, n_cell, n_output,
+                               /*use_cifg=*/true, /*use_peephole=*/false,
+                               /*use_projection_weights=*/true,
+                               /*use_projection_bias=*/true,
+                               /*use_layer_norm=*/true, cell_clip, proj_clip,
+                               inputs, ranges, intermediates);
+
+  // Set weights. The input-gate and input-layer-norm setters stay commented
+  // out because this model uses CIFG.
+  // lstm.SetInputToInputWeights(input_to_input_weights);
+  lstm.SetInputToCellWeights(input_to_cell_weights);
+  lstm.SetInputToForgetWeights(input_to_forget_weights);
+  lstm.SetInputToOutputWeights(input_to_output_weights);
+
+  // lstm.SetInputGateBias(input_gate_bias);
+  lstm.SetCellBias(cell_gate_bias);
+  lstm.SetForgetGateBias(forget_gate_bias);
+  lstm.SetOutputGateBias(output_gate_bias);
+
+  // lstm.SetRecurrentToInputWeights(recurrent_to_input_weights);
+  lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights);
+  lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights);
+  lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights);
+
+  lstm.SetProjectionWeights(projection_weights);
+  lstm.SetProjectionBias(projection_bias);
+
+  // lstm.SetInputLayerNormCoefficients(input_layer_norm_coefficients);
+  lstm.SetForgetLayerNormCoefficients(forget_layer_norm_coefficients);
+  lstm.SetCellLayerNormCoefficients(cell_layer_norm_coefficients);
+  lstm.SetOutputLayerNormCoefficients(output_layer_norm_coefficients);
+
+  // Model inputs: {sequence, batch, input}.
+  const std::vector<std::vector<float>> lstm_input = {
+      {
+          0.7, 0.8, 0.1, 0.2, 0.3,  //
+          0.8, 0.1, 0.2, 0.4, 0.5,  //
+      },
+      {
+          0.2, 0.7, 0.7, 0.1, 0.7,  //
+          0.3, 0.2, 0.9, 0.8, 0.1,  //
+      },
+      {
+          0.7, 0.8, 0.1, 0.2, 0.3,  //
+          0.3, 0.2, 0.9, 0.8, 0.1,  //
+      },
+  };
+
+  // Expected outputs.
+  const std::vector<std::vector<int8_t>> expected_output = {
+      {127, 127, 127, 127, 127, 127},
+      {127, 127, 127, 127, 127, 127},
+      {127, 127, 127, 127, 127, 127},
+  };
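How PopulateQuantizedLstmParams8x8_8 (earlier in this diff) consumes the twelve {scale, zero_point} pairs above can be read off its index arithmetic: they appear to arrive in groups of three per gate, in input/forget/cell/output order, each group holding the combined Wx+Wh scale followed by the separate Wx and Wh scales. A compact restatement of its unrolled QuantizeMultiplier calls under that inferred layout (hypothetical loop, not the patch's code):

// g indexes the gates: 0 input, 1 forget, 2 cell, 3 output (inferred layout).
for (int g = 0; g < 4; ++g) {
  const float s_wx = intermediate_scale[3 * g + 1] / intermediate_scale[3 * g];
  const float s_wh = intermediate_scale[3 * g + 2] / intermediate_scale[3 * g];
  QuantizeMultiplier(s_wx, &intermediate_scale_a[2 * g],
                     &intermediate_scale_b[2 * g]);
  QuantizeMultiplier(s_wh, &intermediate_scale_a[2 * g + 1],
                     &intermediate_scale_b[2 * g + 1]);
}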
+  // Invoke and verify the result.
+  const int input_sequence_size = lstm_input.size();
+  EXPECT_GT(input_sequence_size, 0);
+  for (int i = 0; i < input_sequence_size; ++i) {
+    lstm.SetInput(lstm_input[i]);
+    lstm.Invoke();
+    EXPECT_THAT(lstm.GetOutput(), ElementsAreArray(expected_output[i]));
+  }
+}
+
 #ifdef GTEST_HAS_DEATH_TEST
 TEST(LSTMOpModel, InvalidTypeTest) {
   const int n_batch = 1;
diff --git a/tensorflow/lite/kernels/op_macros.h b/tensorflow/lite/kernels/op_macros.h
index 44208007b8a..33d033b10b6 100644
--- a/tensorflow/lite/kernels/op_macros.h
+++ b/tensorflow/lite/kernels/op_macros.h
@@ -31,7 +31,7 @@ inline void InfiniteLoop() {
   while (1) {
   }
 }
-#define TFLITE_ASSERT_FALSE InfiniteLoop();
+#define TFLITE_ABORT InfiniteLoop();
 
 #else  // TF_LITE_MCU_DEBUG_LOG
@@ -47,14 +47,14 @@ inline void InfiniteLoop() {
 
 #define TFLITE_ABORT abort()
 
+#endif  // TF_LITE_MCU_DEBUG_LOG
+
 #ifdef NDEBUG
 #define TFLITE_ASSERT_FALSE (static_cast<void>(0))
 #else
 #define TFLITE_ASSERT_FALSE TFLITE_ABORT
 #endif
 
-#endif  // TF_LITE_MCU_DEBUG_LOG
-
 #define TF_LITE_FATAL(msg) \
   do {                     \
     DEBUG_LOG(msg);        \
diff --git a/tensorflow/lite/kernels/register.cc b/tensorflow/lite/kernels/register.cc
index 5e2de955983..e8eebd81025 100644
--- a/tensorflow/lite/kernels/register.cc
+++ b/tensorflow/lite/kernels/register.cc
@@ -210,7 +210,9 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_TRANSPOSE_CONV, Register_TRANSPOSE_CONV(),
              /* min_version */ 1,
              /* max_version */ 2);
-  AddBuiltin(BuiltinOperator_TILE, Register_TILE());
+  AddBuiltin(BuiltinOperator_TILE, Register_TILE(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_SUM, Register_SUM(),
              /* min_version */ 1,
              /* max_version */ 2);
diff --git a/tensorflow/lite/kernels/space_to_batch_nd.cc b/tensorflow/lite/kernels/space_to_batch_nd.cc
index b24f0aa30d2..dabbf1395c8 100644
--- a/tensorflow/lite/kernels/space_to_batch_nd.cc
+++ b/tensorflow/lite/kernels/space_to_batch_nd.cc
@@ -50,7 +50,7 @@ struct SpaceToBatchNDContext {
 
 // Currently, only 4D NHWC input/output op_context are supported.
 // The 4D array need to have exactly 2 spatial dimensions.
-// TODO(nupurgarg): Support arbitrary dimension in SpaceToBatchND.
+// TODO(b/149952582): Support arbitrary dimension in SpaceToBatchND.
 const int kInputDimensionNum = 4;
 const int kBlockSizeDimensionNum = 1;
 const int kSpatialDimensionNum = 2;
diff --git a/tensorflow/lite/kernels/test_util.h b/tensorflow/lite/kernels/test_util.h
index 9a3eddcfcb4..d0c765edaca 100644
--- a/tensorflow/lite/kernels/test_util.h
+++ b/tensorflow/lite/kernels/test_util.h
@@ -632,10 +632,17 @@ class SingleOpModel {
                                                      dims_count);
     for (int i = 0; i < dims_count; i++) {
       const int metadata_idx = 2 * i;
+      auto array_segments =
+          CreateInt32Vector(builder_,
+                            builder_.CreateVector(dim_metadata[metadata_idx]))
+              .Union();
+      auto array_indices =
+          CreateInt32Vector(
+              builder_, builder_.CreateVector(dim_metadata[metadata_idx + 1]))
+              .Union();
       fb_dim_metadata[i] = CreateDimensionMetadata(
-          builder_, DimensionType_SPARSE_CSR, 0,
-          builder_.CreateVector(dim_metadata[metadata_idx]),
-          builder_.CreateVector(dim_metadata[metadata_idx + 1]));
+          builder_, DimensionType_SPARSE_CSR, 0, SparseIndexVector_Int32Vector,
+          array_segments, SparseIndexVector_Int32Vector, array_indices);
     }
 
     flatbuffers::Offset<SparsityParameters> s_param = CreateSparsityParameters(
diff --git a/tensorflow/lite/kernels/tile.cc b/tensorflow/lite/kernels/tile.cc
index edbe711d807..64f6bd05485 100644
--- a/tensorflow/lite/kernels/tile.cc
+++ b/tensorflow/lite/kernels/tile.cc
@@ -83,6 +83,18 @@ void CopyMultipleTimes(const T* in_data, int32_t in_size, M multiplier,
   }
 }
 
+template <typename M>
+void CopyStringMultipleTimes(const TfLiteTensor* in_data, int in_data_index,
+                             const int dimension_size, M multiplier,
+                             DynamicBuffer* buffer) {
+  for (M i = 0; i < multiplier; ++i) {
+    for (int j = 0; j < dimension_size; ++j) {
+      const auto string_ref = GetString(in_data, in_data_index + j);
+      buffer->AddString(string_ref.str, string_ref.len);
+    }
+  }
+}
+
 template <typename T, typename M>
 std::pair<int, int> TileOneDimension(const TfLiteIntArray& in_dimensions,
                                      const T* in_data, const M* multipliers,
@@ -116,6 +128,38 @@ std::pair<int, int> TileOneDimension(const TfLiteIntArray& in_dimensions,
       static_cast<M>(total_tiled_stride_size * multipliers[dimension]));
 }
 
+template <typename M>
+std::pair<int, int> TileStringOneDimension(
+    const TfLiteIntArray& in_dimensions, const TfLiteTensor* in_data,
+    int in_data_index, const M* multipliers, DynamicBuffer* buffer,
+    int buffer_index, int dimension, TfLiteTensor* out_data) {
+  const int dimension_size = in_dimensions.data[dimension];
+  if (dimension == in_dimensions.size - 1) {
+    CopyStringMultipleTimes(in_data, in_data_index, dimension_size,
+                            multipliers[dimension], buffer);
+    return {dimension_size,
+            dimension_size * static_cast<int>(multipliers[dimension])};
+  }
+
+  int total_stride_size = 0, total_tiled_stride_size = 0;
+  for (int i = 0; i < dimension_size; ++i) {
+    int stride_size, tiled_stride_size;
+    std::tie(stride_size, tiled_stride_size) = TileStringOneDimension(
+        in_dimensions, in_data, in_data_index + total_stride_size, multipliers,
+        buffer, buffer_index + total_tiled_stride_size, dimension + 1,
+        out_data);
+    total_stride_size += stride_size;
+    total_tiled_stride_size += tiled_stride_size;
+  }
+
+  buffer->WriteToTensor(out_data, /*new_shape=*/nullptr);
+  CopyStringMultipleTimes(out_data, buffer_index, total_tiled_stride_size,
+                          multipliers[dimension] - 1, buffer);
+
+  return {total_stride_size,
+          total_tiled_stride_size * static_cast<int>(multipliers[dimension])};
+}
+
 template <typename T>
 void Tile(const TfLiteIntArray& in_dimensions, const TfLiteTensor* in_data,
           const TfLiteTensor* multipliers, TfLiteTensor* out_data) {
@@ -135,6 +179,26 @@ void Tile(const TfLiteIntArray& in_dimensions, const TfLiteTensor* in_data,
       break;
     }
   }
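TileString below mirrors the numeric TileOneDimension above, but routes strings through a DynamicBuffer because string tensors cannot be copied as raw bytes. A toy illustration of what the innermost step produces, using plain std::string vectors (hypothetical helper, not the kernel code):

#include <string>
#include <vector>

// Tile the last dimension of a row-major [rows x cols] grid of strings.
std::vector<std::string> TileInnerDim(const std::vector<std::string>& in,
                                      int rows, int cols, int multiplier) {
  std::vector<std::string> out;
  out.reserve(static_cast<size_t>(rows) * cols * multiplier);
  for (int r = 0; r < rows; ++r) {
    for (int m = 0; m < multiplier; ++m) {
      for (int c = 0; c < cols; ++c) out.push_back(in[r * cols + c]);
    }
  }
  return out;
}
// TileInnerDim({"AA","AB","AC","BA","BB","BC"}, 2, 3, 2) yields
// {"AA","AB","AC","AA","AB","AC","BA","BB","BC","BA","BB","BC"},
// matching the StringMatrix test expectation later in this diff.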
+
+void TileString(const TfLiteIntArray& in_dimensions,
+                const TfLiteTensor* in_data, const TfLiteTensor* multipliers,
+                DynamicBuffer* buffer, TfLiteTensor* out_data) {
+  // Recursively tile from the outermost to the innermost dimension.
+  switch (multipliers->type) {
+    case kTfLiteInt32:
+      TileStringOneDimension(in_dimensions, in_data, 0,
+                             GetTensorData<int32_t>(multipliers), buffer, 0, 0,
+                             out_data);
+      break;
+    case kTfLiteInt64:
+      TileStringOneDimension(in_dimensions, in_data, 0,
+                             GetTensorData<int64_t>(multipliers), buffer, 0, 0,
+                             out_data);
+      break;
+    default:
+      break;
+  }
+}
 }  // namespace
 
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
@@ -185,6 +249,12 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     case kTfLiteInt64:
       Tile<int64_t>(*(input->dims), input, multipliers, output);
      break;
+    case kTfLiteString: {
+      DynamicBuffer buffer;
+      TileString(*(input->dims), input, multipliers, &buffer, output);
+      buffer.WriteToTensor(output, /*new_shape=*/nullptr);
+      break;
+    }
     case kTfLiteBool:
       Tile<bool>(*(input->dims), input, multipliers, output);
       break;
diff --git a/tensorflow/lite/kernels/tile_test.cc b/tensorflow/lite/kernels/tile_test.cc
index 79b791c8c92..5a7461a8127 100644
--- a/tensorflow/lite/kernels/tile_test.cc
+++ b/tensorflow/lite/kernels/tile_test.cc
@@ -202,6 +202,54 @@ TEST_P(TileTest, Int64Matrix64Multipliers) {
       /*multiply_type=*/TensorType_INT64, GetParam());
 }
 
+TEST_P(TileTest, StringMatrix) {
+  // TODO(b/138722124): Enable these tests on NNAPI.
+  if (SingleOpModel::GetForceUseNnapi()) {
+    return;
+  }
+  Check<std::string>(
+      /*input_shape=*/{2, 3},
+      /*input_data=*/{"AA", "AB", "AC", "BA", "BB", "BC"},
+      /*multipliers_data=*/{1, 2}, /*exp_output_shape=*/{2, 6},
+      /*exp_output_data=*/
+      {"AA", "AB", "AC", "AA", "AB", "AC", "BA", "BB", "BC", "BA", "BB", "BC"},
+      /*input_type=*/TensorType_STRING,
+      /*multiply_type=*/TensorType_INT32, GetParam());
+}
+
+TEST_P(TileTest, StringMatrix64Multipliers) {
+  // TODO(b/138722124): Enable these tests on NNAPI.
+  if (SingleOpModel::GetForceUseNnapi()) {
+    return;
+  }
+  Check<std::string>(
+      /*input_shape=*/{2, 3},
+      /*input_data=*/{"AA", "AB", "AC", "BA", "BB", "BC"},
+      /*multipliers_data=*/{2, 1}, /*exp_output_shape=*/{4, 3},
+      /*exp_output_data=*/
+      {"AA", "AB", "AC", "BA", "BB", "BC", "AA", "AB", "AC", "BA", "BB", "BC"},
+      /*input_type=*/TensorType_STRING,
+      /*multiply_type=*/TensorType_INT64, GetParam());
+}
+
+TEST_P(TileTest, StringMatrix2) {
+  // TODO(b/138722124): Enable these tests on NNAPI.
+  if (SingleOpModel::GetForceUseNnapi()) {
+    return;
+  }
+  Check<std::string>(
+      /*input_shape=*/{3, 2, 1},
+      /*input_data=*/{"AA", "AB", "AC", "BA", "BB", "BC"},
+      /*multipliers_data=*/{2, 2, 2}, /*exp_output_shape=*/{6, 4, 2},
+      /*exp_output_data=*/
+      {"AA", "AA", "AB", "AB", "AA", "AA", "AB", "AB", "AC", "AC", "BA", "BA",
+       "AC", "AC", "BA", "BA", "BB", "BB", "BC", "BC", "BB", "BB", "BC", "BC",
+       "AA", "AA", "AB", "AB", "AA", "AA", "AB", "AB", "AC", "AC", "BA", "BA",
+       "AC", "AC", "BA", "BA", "BB", "BB", "BC", "BC", "BB", "BB", "BC", "BC"},
+      /*input_type=*/TensorType_STRING,
+      /*multiply_type=*/TensorType_INT32, GetParam());
+}
+
 INSTANTIATE_TEST_SUITE_P(TileTest, TileTest,
                          ::testing::Values(TestType::kConst,
                                            TestType::kDynamic));
diff --git a/tensorflow/lite/micro/examples/magic_wand/BUILD b/tensorflow/lite/micro/examples/magic_wand/BUILD
index ee428a8f5ba..7d6f3cdcecd 100644
--- a/tensorflow/lite/micro/examples/magic_wand/BUILD
+++ b/tensorflow/lite/micro/examples/magic_wand/BUILD
@@ -51,9 +51,6 @@ tflite_micro_cc_test(
 
 cc_library(
     name = "constants",
-    srcs = [
-        "constants.cc",
-    ],
     hdrs = [
        "constants.h",
    ],
diff --git a/tensorflow/lite/micro/examples/magic_wand/Makefile.inc b/tensorflow/lite/micro/examples/magic_wand/Makefile.inc
index 20dffad9de0..e956230c8f1 100644
--- a/tensorflow/lite/micro/examples/magic_wand/Makefile.inc
+++ b/tensorflow/lite/micro/examples/magic_wand/Makefile.inc
@@ -26,7 +26,6 @@ OUTPUT_HANDLER_TEST_HDRS := \
 tensorflow/lite/micro/examples/magic_wand/output_handler.h
 
 GESTURE_PREDICTOR_TEST_SRCS := \
-tensorflow/lite/micro/examples/magic_wand/constants.cc \
 tensorflow/lite/micro/examples/magic_wand/gesture_predictor.cc \
 tensorflow/lite/micro/examples/magic_wand/gesture_predictor_test.cc
 
@@ -48,7 +47,6 @@ tensorflow/lite/micro/examples/magic_wand/ring_micro_features_data.h
 magic_wand_SRCS := \
 tensorflow/lite/micro/examples/magic_wand/main.cc \
 tensorflow/lite/micro/examples/magic_wand/main_functions.cc \
-tensorflow/lite/micro/examples/magic_wand/constants.cc \
 tensorflow/lite/micro/examples/magic_wand/magic_wand_model_data.cc \
 tensorflow/lite/micro/examples/magic_wand/accelerometer_handler.cc \
 tensorflow/lite/micro/examples/magic_wand/gesture_predictor.cc \
diff --git a/tensorflow/lite/micro/examples/magic_wand/accelerometer_handler.cc b/tensorflow/lite/micro/examples/magic_wand/accelerometer_handler.cc
index 6211d13bd77..33e0be435a6 100644
--- a/tensorflow/lite/micro/examples/magic_wand/accelerometer_handler.cc
+++ b/tensorflow/lite/micro/examples/magic_wand/accelerometer_handler.cc
@@ -22,7 +22,7 @@ TfLiteStatus SetupAccelerometer(tflite::ErrorReporter* error_reporter) {
 }
 
 bool ReadAccelerometer(tflite::ErrorReporter* error_reporter, float* input,
-                       int length, bool reset_buffer) {
+                       int length) {
   begin_index += 3;
   // Reset begin_index to simulate behavior of loop buffer
   if (begin_index >= 600) begin_index = 0;
@@ -32,5 +32,7 @@ bool ReadAccelerometer(tflite::ErrorReporter* error_reporter, float* input,
   if (begin_index > 300) {
     for (int i = 0; i < length; ++i) input[i] = 0;
     return true;
-  } else { return false; }
+  } else {
+    return false;
+  }
 }
diff --git a/tensorflow/lite/micro/examples/magic_wand/accelerometer_handler.h b/tensorflow/lite/micro/examples/magic_wand/accelerometer_handler.h
index fa086f7f09e..5174cc0eb3e 100644
--- a/tensorflow/lite/micro/examples/magic_wand/accelerometer_handler.h
+++ b/tensorflow/lite/micro/examples/magic_wand/accelerometer_handler.h
@@ -24,6 +24,6 @@ limitations under the License.
extern int begin_index;
 extern TfLiteStatus SetupAccelerometer(tflite::ErrorReporter* error_reporter);
 extern bool ReadAccelerometer(tflite::ErrorReporter* error_reporter,
-                              float* input, int length, bool reset_buffer);
+                              float* input, int length);

 #endif  // TENSORFLOW_LITE_MICRO_EXAMPLES_MAGIC_WAND_ACCELEROMETER_HANDLER_H_
diff --git a/tensorflow/lite/micro/examples/magic_wand/accelerometer_handler_test.cc b/tensorflow/lite/micro/examples/magic_wand/accelerometer_handler_test.cc
index 7d35deba6aa..6c326d20b26 100644
--- a/tensorflow/lite/micro/examples/magic_wand/accelerometer_handler_test.cc
+++ b/tensorflow/lite/micro/examples/magic_wand/accelerometer_handler_test.cc
@@ -33,15 +33,13 @@ TF_LITE_MICRO_TEST(TestAccelerometer) {
   float input[384] = {0.0};
   tflite::MicroErrorReporter micro_error_reporter;
   // Test that the function returns false before sufficient data is available
-  bool inference_flag =
-      ReadAccelerometer(&micro_error_reporter, input, 384, false);
+  bool inference_flag = ReadAccelerometer(&micro_error_reporter, input, 384);
   TF_LITE_MICRO_EXPECT_EQ(inference_flag, false);

   // Test that the function returns true once sufficient data is available to
   // fill the model's input buffer (128 sets of values)
   for (int i = 1; i <= 128; i++) {
-    inference_flag =
-        ReadAccelerometer(&micro_error_reporter, input, 384, false);
+    inference_flag = ReadAccelerometer(&micro_error_reporter, input, 384);
   }
   TF_LITE_MICRO_EXPECT_EQ(inference_flag, true);
 }
diff --git a/tensorflow/lite/micro/examples/magic_wand/arduino/accelerometer_handler.cc b/tensorflow/lite/micro/examples/magic_wand/arduino/accelerometer_handler.cc
index 679ef8c2df7..148a6f29c5c 100644
--- a/tensorflow/lite/micro/examples/magic_wand/arduino/accelerometer_handler.cc
+++ b/tensorflow/lite/micro/examples/magic_wand/arduino/accelerometer_handler.cc
@@ -32,16 +32,17 @@ int sample_every_n;
 int sample_skip_counter = 1;

 TfLiteStatus SetupAccelerometer(tflite::ErrorReporter* error_reporter) {
-  // Wait until we know the serial port is ready
-  while (!Serial) {
-  }
-
   // Switch on the IMU
   if (!IMU.begin()) {
     TF_LITE_REPORT_ERROR(error_reporter, "Failed to initialize IMU");
     return kTfLiteError;
   }

+  // Make sure we are pulling measurements into a FIFO.
+  // If you see an error on this line, make sure you have at least v1.1.0 of the
+  // Arduino_LSM9DS1 library installed.
+  IMU.setContinuousMode();
+
   // Determine how many measurements to keep in order to
   // meet kTargetHz
   float sample_rate = IMU.accelerationSampleRate();
@@ -53,13 +54,7 @@ TfLiteStatus SetupAccelerometer(tflite::ErrorReporter* error_reporter) {
 }

 bool ReadAccelerometer(tflite::ErrorReporter* error_reporter, float* input,
-                       int length, bool reset_buffer) {
-  // Clear the buffer if required, e.g. after a successful prediction
-  if (reset_buffer) {
-    memset(save_data, 0, 600 * sizeof(float));
-    begin_index = 0;
-    pending_initial_data = true;
-  }
+                       int length) {
   // Keep track of whether we stored any new data
   bool new_data = false;
   // Loop through new samples and add to buffer
@@ -75,13 +70,32 @@ bool ReadAccelerometer(tflite::ErrorReporter* error_reporter, float* input,
       sample_skip_counter += 1;
       continue;
     }
-    // Write samples to our buffer, converting to milli-Gs
-    // and flipping y and x order for compatibility with
-    // model (sensor orientation is different on Arduino
-    // Nano BLE Sense compared with SparkFun Edge)
-    save_data[begin_index++] = y * 1000;
-    save_data[begin_index++] = x * 1000;
-    save_data[begin_index++] = z * 1000;
+    // Write samples to our buffer, converting to milli-Gs and rotating the axis
+    // order for compatibility with the model (sensor orientation is different
+    // on Arduino Nano BLE Sense compared with SparkFun Edge).
+    // The expected orientation of the Arduino on the wand is with the USB port
+    // facing down the shaft towards the user's hand, with the reset button
+    // pointing at the user's face:
+    //
+    //                  ____
+    //                 |    |<- Arduino board
+    //                 |    |
+    //                 | () |  <- Reset button
+    //                 |    |
+    //                  -TT-   <- USB port
+    //                   ||
+    //                   ||<- Wand
+    //                  ....
+    //                   ||
+    //                   ||
+    //                   ()
+    //
+    const float norm_x = -z;
+    const float norm_y = y;
+    const float norm_z = x;
+    save_data[begin_index++] = norm_x * 1000;
+    save_data[begin_index++] = norm_y * 1000;
+    save_data[begin_index++] = norm_z * 1000;
     // Since we took a sample, reset the skip counter
     sample_skip_counter = 1;
     // If we reached the end of the circular buffer, reset
diff --git a/tensorflow/lite/micro/examples/magic_wand/arduino/constants.cc b/tensorflow/lite/micro/examples/magic_wand/arduino/constants.cc
deleted file mode 100644
index 6a0a37b6878..00000000000
--- a/tensorflow/lite/micro/examples/magic_wand/arduino/constants.cc
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/micro/examples/magic_wand/constants.h"
-
-// The number of expected consecutive inferences for each gesture type.
-// Established with the Arduino Nano 33 BLE Sense.
-const int kConsecutiveInferenceThresholds[3] = {8, 5, 4}; diff --git a/tensorflow/lite/micro/examples/magic_wand/arduino/output_handler.cc b/tensorflow/lite/micro/examples/magic_wand/arduino/output_handler.cc index f1329ca3472..ae2f570ea42 100644 --- a/tensorflow/lite/micro/examples/magic_wand/arduino/output_handler.cc +++ b/tensorflow/lite/micro/examples/magic_wand/arduino/output_handler.cc @@ -24,15 +24,8 @@ void HandleOutput(tflite::ErrorReporter* error_reporter, int kind) { pinMode(LED_BUILTIN, OUTPUT); is_initialized = true; } - // Toggle the LED every time an inference is performed - static int count = 0; - ++count; - if (count & 1) { - digitalWrite(LED_BUILTIN, HIGH); - } else { - digitalWrite(LED_BUILTIN, LOW); - } - // Print some ASCII art for each gesture + + // Print some ASCII art for each gesture and control the LED. if (kind == 0) { TF_LITE_REPORT_ERROR( error_reporter, @@ -40,12 +33,14 @@ void HandleOutput(tflite::ErrorReporter* error_reporter, int kind) { "*\n\r * * * *\n\r * * * *\n\r * * " "* *\n\r * *\n\r"); } else if (kind == 1) { + digitalWrite(LED_BUILTIN, HIGH); TF_LITE_REPORT_ERROR( error_reporter, "RING:\n\r *\n\r * *\n\r * *\n\r " " * *\n\r * *\n\r * *\n\r " " *\n\r"); } else if (kind == 2) { + digitalWrite(LED_BUILTIN, LOW); TF_LITE_REPORT_ERROR( error_reporter, "SLOPE:\n\r *\n\r *\n\r *\n\r *\n\r " diff --git a/tensorflow/lite/micro/examples/magic_wand/constants.cc b/tensorflow/lite/micro/examples/magic_wand/constants.cc deleted file mode 100644 index 6866bd6f968..00000000000 --- a/tensorflow/lite/micro/examples/magic_wand/constants.cc +++ /dev/null @@ -1,20 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/lite/micro/examples/magic_wand/constants.h" - -// The number of expected consecutive inferences for each gesture type. -// These defaults were established with the SparkFun Edge board. -const int kConsecutiveInferenceThresholds[3] = {15, 12, 10}; diff --git a/tensorflow/lite/micro/examples/magic_wand/constants.h b/tensorflow/lite/micro/examples/magic_wand/constants.h index 9e225419f8e..3f0da6c9a83 100644 --- a/tensorflow/lite/micro/examples/magic_wand/constants.h +++ b/tensorflow/lite/micro/examples/magic_wand/constants.h @@ -19,6 +19,20 @@ limitations under the License. // The expected accelerometer data sample frequency const float kTargetHz = 25; -// The number of expected consecutive inferences for each gesture type -extern const int kConsecutiveInferenceThresholds[3]; +// What gestures are supported. +constexpr int kGestureCount = 4; +constexpr int kWingGesture = 0; +constexpr int kRingGesture = 1; +constexpr int kSlopeGesture = 2; +constexpr int kNoGesture = 3; + +// These control the sensitivity of the detection algorithm. If you're seeing +// too many false positives or not enough true positives, you can try tweaking +// these thresholds. 
Often, increasing the size of the training set will give +// more robust results though, so consider retraining if you are seeing poor +// predictions. +constexpr float kDetectionThreshold = 0.8f; +constexpr int kPredictionHistoryLength = 5; +constexpr int kPredictionSuppressionDuration = 25; + #endif // TENSORFLOW_LITE_MICRO_EXAMPLES_MAGIC_WAND_CONSTANTS_H_ diff --git a/tensorflow/lite/micro/examples/magic_wand/gesture_predictor.cc b/tensorflow/lite/micro/examples/magic_wand/gesture_predictor.cc index a7a71b23395..b09499a836d 100644 --- a/tensorflow/lite/micro/examples/magic_wand/gesture_predictor.cc +++ b/tensorflow/lite/micro/examples/magic_wand/gesture_predictor.cc @@ -17,39 +17,56 @@ limitations under the License. #include "tensorflow/lite/micro/examples/magic_wand/constants.h" -// How many times the most recent gesture has been matched in a row -int continuous_count = 0; -// The result of the last prediction -int last_predict = -1; +namespace { +// State for the averaging algorithm we're using. +float prediction_history[kGestureCount][kPredictionHistoryLength] = {}; +int prediction_history_index = 0; +int prediction_suppression_count = 0; +} // namespace // Return the result of the last prediction -// 0: wing, 1: ring, 2: slope, 3: unknown +// 0: wing("W"), 1: ring("O"), 2: slope("angle"), 3: unknown int PredictGesture(float* output) { - // Find whichever output has a probability > 0.8 (they sum to 1) - int this_predict = -1; - for (int i = 0; i < 3; i++) { - if (output[i] > 0.8) this_predict = i; + // Record the latest predictions in our rolling history buffer. + for (int i = 0; i < kGestureCount; ++i) { + prediction_history[i][prediction_history_index] = output[i]; } - // No gesture was detected above the threshold - if (this_predict == -1) { - continuous_count = 0; - last_predict = 3; - return 3; + // Figure out which slot to put the next predictions into. + ++prediction_history_index; + if (prediction_history_index >= kPredictionHistoryLength) { + prediction_history_index = 0; } - if (last_predict == this_predict) { - continuous_count += 1; + + // Average the last n predictions for each gesture, and find which has the + // highest score. + int max_predict_index = -1; + float max_predict_score = 0.0f; + for (int i = 0; i < kGestureCount; i++) { + float prediction_sum = 0.0f; + for (int j = 0; j < kPredictionHistoryLength; ++j) { + prediction_sum += prediction_history[i][j]; + } + const float prediction_average = prediction_sum / kPredictionHistoryLength; + if ((max_predict_index == -1) || (prediction_average > max_predict_score)) { + max_predict_index = i; + max_predict_score = prediction_average; + } + } + + // If there's been a recent prediction, don't trigger a new one too soon. + if (prediction_suppression_count > 0) { + --prediction_suppression_count; + } + // If we're predicting no gesture, or the average score is too low, or there's + // been a gesture recognised too recently, return no gesture. + if ((max_predict_index == kNoGesture) || + (max_predict_score < kDetectionThreshold) || + (prediction_suppression_count > 0)) { + return kNoGesture; } else { - continuous_count = 0; + // Reset the suppression counter so we don't come up with another prediction + // too soon. 
+ prediction_suppression_count = kPredictionSuppressionDuration; + return max_predict_index; } - last_predict = this_predict; - // If we haven't yet had enough consecutive matches for this gesture, - // report a negative result - if (continuous_count < kConsecutiveInferenceThresholds[this_predict]) { - return 3; - } - // Otherwise, we've seen a positive result, so clear all our variables - // and report it - continuous_count = 0; - last_predict = -1; - return this_predict; } diff --git a/tensorflow/lite/micro/examples/magic_wand/gesture_predictor_test.cc b/tensorflow/lite/micro/examples/magic_wand/gesture_predictor_test.cc index 880cf373b1c..7488666df05 100644 --- a/tensorflow/lite/micro/examples/magic_wand/gesture_predictor_test.cc +++ b/tensorflow/lite/micro/examples/magic_wand/gesture_predictor_test.cc @@ -21,48 +21,45 @@ limitations under the License. TF_LITE_MICRO_TESTS_BEGIN TF_LITE_MICRO_TEST(SuccessfulPrediction) { - // Use the threshold from the 0th gesture - int threshold = kConsecutiveInferenceThresholds[0]; - float probabilities[4] = {1.0, 0.0, 0.0, 0.0}; + // Use the threshold from the 0th gesture. + float probabilities[kGestureCount] = {kDetectionThreshold, 0.0, 0.0, 0.0}; int prediction; - // Loop just too few times to trigger a prediction - for (int i = 0; i <= threshold - 1; i++) { + // Loop just too few times to trigger a prediction. + for (int i = 0; i < kPredictionHistoryLength - 1; i++) { prediction = PredictGesture(probabilities); - TF_LITE_MICRO_EXPECT_EQ(prediction, 3); + TF_LITE_MICRO_EXPECT_EQ(prediction, kNoGesture); } // Call once more, triggering a prediction - // for category 0 + // for category 0. prediction = PredictGesture(probabilities); TF_LITE_MICRO_EXPECT_EQ(prediction, 0); } TF_LITE_MICRO_TEST(FailPartWayThere) { - // Use the threshold from the 0th gesture - int threshold = kConsecutiveInferenceThresholds[0]; - float probabilities[4] = {1.0, 0.0, 0.0, 0.0}; + // Use the threshold from the 0th gesture. + float probabilities[kGestureCount] = {kDetectionThreshold, 0.0, 0.0, 0.0}; int prediction; - // Loop just too few times to trigger a prediction - for (int i = 0; i <= threshold - 1; i++) { + // Loop just too few times to trigger a prediction. + for (int i = 0; i <= kPredictionHistoryLength - 1; i++) { prediction = PredictGesture(probabilities); - TF_LITE_MICRO_EXPECT_EQ(prediction, 3); + TF_LITE_MICRO_EXPECT_EQ(prediction, kNoGesture); } - // Call with a different prediction, triggering a failure + // Call with a different prediction, triggering a failure. probabilities[0] = 0.0; probabilities[2] = 1.0; prediction = PredictGesture(probabilities); - TF_LITE_MICRO_EXPECT_EQ(prediction, 3); + TF_LITE_MICRO_EXPECT_EQ(prediction, kNoGesture); } TF_LITE_MICRO_TEST(InsufficientProbability) { - // Use the threshold from the 0th gesture - int threshold = kConsecutiveInferenceThresholds[0]; - // Below the probability threshold of 0.8 - float probabilities[4] = {0.7, 0.0, 0.0, 0.0}; + // Just below the detection threshold. 
+ float probabilities[kGestureCount] = {kDetectionThreshold - 0.1f, 0.0, 0.0, + 0.0}; int prediction; // Loop the exact right number of times - for (int i = 0; i <= threshold; i++) { + for (int i = 0; i <= kPredictionHistoryLength; i++) { prediction = PredictGesture(probabilities); - TF_LITE_MICRO_EXPECT_EQ(prediction, 3); + TF_LITE_MICRO_EXPECT_EQ(prediction, kNoGesture); } } diff --git a/tensorflow/lite/micro/examples/magic_wand/main_functions.cc b/tensorflow/lite/micro/examples/magic_wand/main_functions.cc index 0913c372dc0..51e6e593cd1 100644 --- a/tensorflow/lite/micro/examples/magic_wand/main_functions.cc +++ b/tensorflow/lite/micro/examples/magic_wand/main_functions.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/lite/micro/examples/magic_wand/main_functions.h" #include "tensorflow/lite/micro/examples/magic_wand/accelerometer_handler.h" +#include "tensorflow/lite/micro/examples/magic_wand/constants.h" #include "tensorflow/lite/micro/examples/magic_wand/gesture_predictor.h" #include "tensorflow/lite/micro/examples/magic_wand/magic_wand_model_data.h" #include "tensorflow/lite/micro/examples/magic_wand/output_handler.h" @@ -39,9 +40,6 @@ int input_length; // determined by experimentation. constexpr int kTensorArenaSize = 60 * 1024; uint8_t tensor_arena[kTensorArenaSize]; - -// Whether we should clear the buffer next time we fetch data -bool should_clear_buffer = false; } // namespace // The name of this function is important for Arduino compatibility. @@ -80,15 +78,15 @@ void setup() { micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_SOFTMAX, tflite::ops::micro::Register_SOFTMAX()); - // Build an interpreter to run the model with + // Build an interpreter to run the model with. static tflite::MicroInterpreter static_interpreter( model, micro_op_resolver, tensor_arena, kTensorArenaSize, error_reporter); interpreter = &static_interpreter; - // Allocate memory from the tensor_arena for the model's tensors + // Allocate memory from the tensor_arena for the model's tensors. interpreter->AllocateTensors(); - // Obtain pointer to the model's input tensor + // Obtain pointer to the model's input tensor. model_input = interpreter->input(0); if ((model_input->dims->size != 4) || (model_input->dims->data[0] != 1) || (model_input->dims->data[1] != 128) || @@ -108,14 +106,13 @@ void setup() { } void loop() { - // Attempt to read new data from the accelerometer - bool got_data = ReadAccelerometer(error_reporter, model_input->data.f, - input_length, should_clear_buffer); - // Don't try to clear the buffer again - should_clear_buffer = false; - // If there was no new data, wait until next time + // Attempt to read new data from the accelerometer. + bool got_data = + ReadAccelerometer(error_reporter, model_input->data.f, input_length); + // If there was no new data, wait until next time. if (!got_data) return; - // Run inference, and report any error + + // Run inference, and report any error. 
TfLiteStatus invoke_status = interpreter->Invoke(); if (invoke_status != kTfLiteOk) { TF_LITE_REPORT_ERROR(error_reporter, "Invoke failed on index: %d\n", @@ -124,8 +121,7 @@ void loop() { } // Analyze the results to obtain a prediction int gesture_index = PredictGesture(interpreter->output(0)->data.f); - // Clear the buffer next time we read data - should_clear_buffer = gesture_index < 3; + // Produce an output HandleOutput(error_reporter, gesture_index); } diff --git a/tensorflow/lite/micro/examples/magic_wand/sparkfun_edge/accelerometer_handler.cc b/tensorflow/lite/micro/examples/magic_wand/sparkfun_edge/accelerometer_handler.cc index b6a36dec059..ff527c78d46 100644 --- a/tensorflow/lite/micro/examples/magic_wand/sparkfun_edge/accelerometer_handler.cc +++ b/tensorflow/lite/micro/examples/magic_wand/sparkfun_edge/accelerometer_handler.cc @@ -147,15 +147,7 @@ TfLiteStatus SetupAccelerometer(tflite::ErrorReporter* error_reporter) { } bool ReadAccelerometer(tflite::ErrorReporter* error_reporter, float* input, - int length, bool reset_buffer) { - // Clear the buffer if required, e.g. after a successful prediction - if (reset_buffer) { - memset(save_data, 0, 600 * sizeof(float)); - begin_index = 0; - pending_initial_data = true; - // Wait 10ms after a reset to avoid hang - am_util_delay_ms(10); - } + int length) { // Check FIFO buffer for new samples lis2dh12_fifo_src_reg_t status; if (lis2dh12_fifo_status_get(&dev_ctx, &status)) { diff --git a/tensorflow/lite/micro/examples/micro_speech/esp/Makefile.inc b/tensorflow/lite/micro/examples/micro_speech/esp/Makefile.inc index f03f199d215..734f40a9473 100644 --- a/tensorflow/lite/micro/examples/micro_speech/esp/Makefile.inc +++ b/tensorflow/lite/micro/examples/micro_speech/esp/Makefile.inc @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -ifeq ($(TARGET), "esp") +ifeq ($(TARGET), esp) # Adding some esp specific files in the main CMakeLists.txt ESP_MICRO_SPEECH_SRCS := \ diff --git a/tensorflow/lite/micro/kernels/BUILD b/tensorflow/lite/micro/kernels/BUILD index 14cb48ec7fb..0df1052e4b7 100644 --- a/tensorflow/lite/micro/kernels/BUILD +++ b/tensorflow/lite/micro/kernels/BUILD @@ -190,6 +190,7 @@ tflite_micro_cc_test( deps = [ ":all_ops_resolver", "//tensorflow/lite/c:common", + "//tensorflow/lite/kernels/internal:tensor", "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/testing:micro_test", ], @@ -203,6 +204,7 @@ tflite_micro_cc_test( deps = [ ":portable_optimized_ops_resolver", "//tensorflow/lite/c:common", + "//tensorflow/lite/kernels/internal:tensor", "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/testing:micro_test", ], diff --git a/tensorflow/lite/micro/kernels/depthwise_conv_test.cc b/tensorflow/lite/micro/kernels/depthwise_conv_test.cc index 50673fe79d1..2201223d0c4 100644 --- a/tensorflow/lite/micro/kernels/depthwise_conv_test.cc +++ b/tensorflow/lite/micro/kernels/depthwise_conv_test.cc @@ -15,6 +15,7 @@ limitations under the License. 
#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/micro/kernels/all_ops_resolver.h" #include "tensorflow/lite/micro/testing/micro_test.h" #include "tensorflow/lite/micro/testing/test_utils.h" @@ -23,16 +24,25 @@ namespace tflite { namespace testing { namespace { -static const int kMaxFilterChannels = 64; -static const int kMaxBiasChannels = 64; +constexpr int kMaxFilterChannels = 64; +constexpr int kMaxBiasChannels = 64; +// Index of the output tensor in context->tensors, specific to +// DepthwiseConv. +constexpr int kOutputTensorIndex = 3; + +// Creates a DepthwiseConv opeerator, calls it with the provided input tensors +// and some defaults parameters, and compares the output with +// expected_output_data. +// +// The tensors parameter contains both the input tensors as well as a +// preallocated output tensor into which the output is stored. template -TfLiteStatus ValidateDepthwiseConvGoldens(TfLiteTensor* tensors, - int tensors_size, - const T* expected_output_data, - T* output_data, int output_length, +TfLiteStatus ValidateDepthwiseConvGoldens(const T* expected_output_data, + int output_length, TfLiteFusedActivation activation, - float tolerance = 1e-5) { + float tolerance, int tensors_size, + TfLiteTensor* tensors) { TfLiteContext context; PopulateContext(tensors, tensors_size, &context); @@ -87,6 +97,8 @@ TfLiteStatus ValidateDepthwiseConvGoldens(TfLiteTensor* tensors, if (registration->free) { registration->free(&context, user_data); } + + const T* output_data = tflite::GetTensorData(&tensors[kOutputTensorIndex]); for (int i = 0; i < output_length; ++i) { TF_LITE_MICRO_EXPECT_NEAR(expected_output_data[i], output_data[i], tolerance); @@ -118,8 +130,8 @@ void TestDepthwiseConvFloat(const int* input_dims_data, const float* input_data, CreateFloatTensor(output_data, output_dims, "output_tensor"), }; - ValidateDepthwiseConvGoldens(tensors, tensors_size, expected_output_data, - output_data, output_dims_count, activation); + ValidateDepthwiseConvGoldens(expected_output_data, output_dims_count, + activation, 1e-5, tensors_size, tensors); } void TestDepthwiseConvQuantizedPerLayer( @@ -171,8 +183,8 @@ void TestDepthwiseConvQuantizedPerLayer( AsymmetricQuantize(golden, golden_quantized, output_dims_count, output_scale, output_zero_point); - ValidateDepthwiseConvGoldens(tensors, tensors_size, golden_quantized, - output_data, output_dims_count, activation, 1.0); + ValidateDepthwiseConvGoldens(golden_quantized, output_dims_count, activation, + 1.0, tensors_size, tensors); } void TestDepthwiseConvQuantizedPerChannel( @@ -240,9 +252,9 @@ void TestDepthwiseConvQuantizedPerChannel( output_dims_count, output_scale, output_zero_point); TF_LITE_MICRO_EXPECT_EQ( - kTfLiteOk, ValidateDepthwiseConvGoldens( - tensors, tensors_size, expected_output_data_quantized, - output_data, output_dims_count, activation, 1.0)); + kTfLiteOk, ValidateDepthwiseConvGoldens(expected_output_data_quantized, + output_dims_count, activation, + 1.0, tensors_size, tensors)); } } // namespace @@ -261,7 +273,6 @@ TF_LITE_MICRO_TEST(SimpleTest) { 5, 6, 7, 8, 13, -14, 15, -16}; const int bias_elements = 4; const int bias_shape[] = {4, 1, 1, 1, 4}; - const int output_elements = 8; const float bias_values[] = {1, 2, 3, 4}; const float golden[] = { 71, -34, 99, -20, 91, -26, 127, -4, @@ -290,7 +301,6 @@ TF_LITE_MICRO_TEST(SimpleTestQuantized) { 71, -34, 99, -20, 91, -26, 127, -4, }; const int output_shape[] = {4, 1, 
2, 1, 4}; - const int output_dims_count = 8; const float input_scale = 0.5f; const int input_zero_point = 128; @@ -646,16 +656,16 @@ TF_LITE_MICRO_TEST(FilterDimsNotMatchingAffineQuantization) { quant->scale->size = 2; TF_LITE_MICRO_EXPECT_EQ( kTfLiteError, tflite::testing::ValidateDepthwiseConvGoldens( - tensors, tensors_size, golden_quantized, output_data, - output_size, kTfLiteActNone)); + golden_quantized, output_size, kTfLiteActNone, 1e-5, + tensors_size, tensors)); // Set scale back to correct dimension, and make zero point array too short. quant->scale->size = filter_shape[0]; quant->zero_point->size = 2; TF_LITE_MICRO_EXPECT_EQ( kTfLiteError, tflite::testing::ValidateDepthwiseConvGoldens( - tensors, tensors_size, golden_quantized, output_data, - output_size, kTfLiteActNone)); + golden_quantized, output_size, kTfLiteActNone, 1e-5, + tensors_size, tensors)); } TF_LITE_MICRO_TEST(PerChannelBroadcastQuantizationParams) { @@ -751,8 +761,8 @@ TF_LITE_MICRO_TEST(PerChannelBroadcastQuantizationParams) { TF_LITE_MICRO_EXPECT_EQ( kTfLiteOk, tflite::testing::ValidateDepthwiseConvGoldens( - tensors, tensors_size, golden_quantized, output_data, - output_dims_count, kTfLiteActNone)); + golden_quantized, output_dims_count, kTfLiteActNone, 1e-5, + tensors_size, tensors)); } TF_LITE_MICRO_TESTS_END diff --git a/tensorflow/lite/micro/kernels/pooling.cc b/tensorflow/lite/micro/kernels/pooling.cc index b340ac965e5..1c2ba12e386 100644 --- a/tensorflow/lite/micro/kernels/pooling.cc +++ b/tensorflow/lite/micro/kernels/pooling.cc @@ -77,8 +77,7 @@ void AverageEvalFloat(const TfLiteContext* context, const TfLiteNode* node, void AverageEvalQuantized(TfLiteContext* context, const TfLiteNode* node, const TfLitePoolParams* params, const OpData* data, const TfLiteTensor* input, TfLiteTensor* output) { - assert(input->type == kTfLiteUInt8 || input->type == kTfLiteInt8); - + TFLITE_DCHECK(input->type == kTfLiteUInt8 || input->type == kTfLiteInt8); int32_t activation_min, activation_max; (void)CalculateActivationRangeQuantized(context, params->activation, output, &activation_min, &activation_max); @@ -128,7 +127,7 @@ void MaxEvalFloat(TfLiteContext* context, TfLiteNode* node, void MaxEvalQuantized(TfLiteContext* context, TfLiteNode* node, TfLitePoolParams* params, OpData* data, const TfLiteTensor* input, TfLiteTensor* output) { - assert(input->type == kTfLiteUInt8 || input->type == kTfLiteInt8); + TFLITE_DCHECK(input->type == kTfLiteUInt8 || input->type == kTfLiteInt8); int32_t activation_min, activation_max; (void)CalculateActivationRangeQuantized(context, params->activation, output, diff --git a/tensorflow/lite/micro/kernels/reshape.cc b/tensorflow/lite/micro/kernels/reshape.cc index d7a5a6181fb..376c612ef59 100644 --- a/tensorflow/lite/micro/kernels/reshape.cc +++ b/tensorflow/lite/micro/kernels/reshape.cc @@ -69,18 +69,20 @@ TfLiteStatus ReshapeOutput(TfLiteContext* context, TfLiteNode* node) { TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE(context, NumInputs(node) == 1 || NumInputs(node) == 2); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + TF_LITE_ENSURE_EQ(context, ReshapeOutput(context, node), kTfLiteOk); return kTfLiteOk; } TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input = GetInput(context, node, kInputTensor); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - if (ReshapeOutput(context, node) != kTfLiteOk) { - return kTfLiteError; - } - for (size_t i = 0; i < input->bytes; ++i) { - output->data.raw[i] = 
input->data.raw[i];
+  // Do nothing for in-place reshape.
+  if (input->data.raw != output->data.raw) {
+    // Otherwise perform reshape with copy.
+    for (size_t i = 0; i < input->bytes; ++i) {
+      output->data.raw[i] = input->data.raw[i];
+    }
   }
   return kTfLiteOk;
 }
diff --git a/tensorflow/lite/micro/kernels/reshape_test.cc b/tensorflow/lite/micro/kernels/reshape_test.cc
index e252e13fa50..16d70a0159e 100644
--- a/tensorflow/lite/micro/kernels/reshape_test.cc
+++ b/tensorflow/lite/micro/kernels/reshape_test.cc
@@ -77,7 +77,13 @@ void TestReshapeImpl(TfLiteTensor* input_tensor, TfLiteTensor* shape_tensor,
   TF_LITE_MICRO_EXPECT_EQ(registration->free, nullptr);

   if (registration->prepare) {
-    TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
+    // An error can occur in either the Prepare or the Eval stage.
+    auto status = registration->prepare(&context, &node);
+    if (status == kTfLiteError && expect_failure) {
+      return;
+    } else {
+      TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, status);
+    }
   }
   if (expect_failure) {
     TF_LITE_MICRO_EXPECT_EQ(kTfLiteError,
diff --git a/tensorflow/lite/micro/kernels/xtensa-hifimini/conv.cc b/tensorflow/lite/micro/kernels/xtensa_hifimini/conv.cc
similarity index 99%
rename from tensorflow/lite/micro/kernels/xtensa-hifimini/conv.cc
rename to tensorflow/lite/micro/kernels/xtensa_hifimini/conv.cc
index 981f638eb6b..dd68a7b74b6 100644
--- a/tensorflow/lite/micro/kernels/xtensa-hifimini/conv.cc
+++ b/tensorflow/lite/micro/kernels/xtensa_hifimini/conv.cc
@@ -24,8 +24,8 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/kernels/padding.h"
-#include "tensorflow/lite/micro/kernels/xtensa-hifimini/fixedpoint_utils.h"
-#include "tensorflow/lite/micro/kernels/xtensa-hifimini/utils.h"
+#include "tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h"
+#include "tensorflow/lite/micro/kernels/xtensa_hifimini/utils.h"

 namespace tflite {
 namespace ops {
diff --git a/tensorflow/lite/micro/kernels/xtensa-hifimini/depthwise_conv.cc b/tensorflow/lite/micro/kernels/xtensa_hifimini/depthwise_conv.cc
similarity index 99%
rename from tensorflow/lite/micro/kernels/xtensa-hifimini/depthwise_conv.cc
rename to tensorflow/lite/micro/kernels/xtensa_hifimini/depthwise_conv.cc
index 44ec720dba5..da0f440e523 100644
--- a/tensorflow/lite/micro/kernels/xtensa-hifimini/depthwise_conv.cc
+++ b/tensorflow/lite/micro/kernels/xtensa_hifimini/depthwise_conv.cc
@@ -24,8 +24,8 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/padding.h" -#include "tensorflow/lite/micro/kernels/xtensa-hifimini/fixedpoint_utils.h" -#include "tensorflow/lite/micro/kernels/xtensa-hifimini/utils.h" +#include "tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h" +#include "tensorflow/lite/micro/kernels/xtensa_hifimini/utils.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/micro/kernels/xtensa-hifimini/fixedpoint_utils.h b/tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h similarity index 99% rename from tensorflow/lite/micro/kernels/xtensa-hifimini/fixedpoint_utils.h rename to tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h index b23b50d38ce..bacea7c2eb4 100644 --- a/tensorflow/lite/micro/kernels/xtensa-hifimini/fixedpoint_utils.h +++ b/tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h @@ -19,7 +19,7 @@ limitations under the License. #include #include -#include "tensorflow/lite/micro/kernels/xtensa-hifimini/utils.h" +#include "tensorflow/lite/micro/kernels/xtensa_hifimini/utils.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/micro/kernels/xtensa-hifimini/fully_connected.cc b/tensorflow/lite/micro/kernels/xtensa_hifimini/fully_connected.cc similarity index 98% rename from tensorflow/lite/micro/kernels/xtensa-hifimini/fully_connected.cc rename to tensorflow/lite/micro/kernels/xtensa_hifimini/fully_connected.cc index 315799c7c9b..91761b00c2a 100644 --- a/tensorflow/lite/micro/kernels/xtensa-hifimini/fully_connected.cc +++ b/tensorflow/lite/micro/kernels/xtensa_hifimini/fully_connected.cc @@ -24,8 +24,8 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/micro/kernels/xtensa-hifimini/fixedpoint_utils.h" -#include "tensorflow/lite/micro/kernels/xtensa-hifimini/utils.h" +#include "tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h" +#include "tensorflow/lite/micro/kernels/xtensa_hifimini/utils.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/micro/kernels/xtensa-hifimini/quantize.cc b/tensorflow/lite/micro/kernels/xtensa_hifimini/quantize.cc similarity index 97% rename from tensorflow/lite/micro/kernels/xtensa-hifimini/quantize.cc rename to tensorflow/lite/micro/kernels/xtensa_hifimini/quantize.cc index d10e780d4a3..0859c54cce4 100644 --- a/tensorflow/lite/micro/kernels/xtensa-hifimini/quantize.cc +++ b/tensorflow/lite/micro/kernels/xtensa_hifimini/quantize.cc @@ -21,8 +21,8 @@ limitations under the License. 
#include "tensorflow/lite/kernels/internal/quantization_util.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/micro/kernels/xtensa-hifimini/fixedpoint_utils.h" -#include "tensorflow/lite/micro/kernels/xtensa-hifimini/utils.h" +#include "tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h" +#include "tensorflow/lite/micro/kernels/xtensa_hifimini/utils.h" namespace tflite { namespace ops { diff --git a/tensorflow/lite/micro/kernels/xtensa-hifimini/softmax.cc b/tensorflow/lite/micro/kernels/xtensa_hifimini/softmax.cc similarity index 100% rename from tensorflow/lite/micro/kernels/xtensa-hifimini/softmax.cc rename to tensorflow/lite/micro/kernels/xtensa_hifimini/softmax.cc diff --git a/tensorflow/lite/micro/kernels/xtensa-hifimini/svdf.cc b/tensorflow/lite/micro/kernels/xtensa_hifimini/svdf.cc similarity index 99% rename from tensorflow/lite/micro/kernels/xtensa-hifimini/svdf.cc rename to tensorflow/lite/micro/kernels/xtensa_hifimini/svdf.cc index 1a0b0fe12c8..f337572fd5c 100644 --- a/tensorflow/lite/micro/kernels/xtensa-hifimini/svdf.cc +++ b/tensorflow/lite/micro/kernels/xtensa_hifimini/svdf.cc @@ -24,8 +24,8 @@ limitations under the License. #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/op_macros.h" #include "tensorflow/lite/micro/kernels/activation_utils.h" -#include "tensorflow/lite/micro/kernels/xtensa-hifimini/fixedpoint_utils.h" -#include "tensorflow/lite/micro/kernels/xtensa-hifimini/utils.h" +#include "tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h" +#include "tensorflow/lite/micro/kernels/xtensa_hifimini/utils.h" #include "tensorflow/lite/micro/micro_utils.h" namespace tflite { diff --git a/tensorflow/lite/micro/kernels/xtensa-hifimini/utils.h b/tensorflow/lite/micro/kernels/xtensa_hifimini/utils.h similarity index 100% rename from tensorflow/lite/micro/kernels/xtensa-hifimini/utils.h rename to tensorflow/lite/micro/kernels/xtensa_hifimini/utils.h diff --git a/tensorflow/lite/micro/memory_planner/greedy_memory_planner.cc b/tensorflow/lite/micro/memory_planner/greedy_memory_planner.cc index f96b249f1a8..7f7f7b6d980 100644 --- a/tensorflow/lite/micro/memory_planner/greedy_memory_planner.cc +++ b/tensorflow/lite/micro/memory_planner/greedy_memory_planner.cc @@ -266,6 +266,7 @@ size_t GreedyMemoryPlanner::GetMaximumMemorySize() { } void GreedyMemoryPlanner::PrintMemoryPlan(ErrorReporter* error_reporter) { +#ifndef NDEBUG CalculateOffsetsIfNeeded(); for (int i = 0; i < buffer_count_; ++i) { @@ -333,6 +334,7 @@ void GreedyMemoryPlanner::PrintMemoryPlan(ErrorReporter* error_reporter) { line[kLineWidth] = 0; TF_LITE_REPORT_ERROR(error_reporter, "%s", line); } +#endif } int GreedyMemoryPlanner::GetBufferCount() { return buffer_count_; } diff --git a/tensorflow/lite/micro/micro_optional_debug_tools.cc b/tensorflow/lite/micro/micro_optional_debug_tools.cc index bc69eb55315..6f894541572 100644 --- a/tensorflow/lite/micro/micro_optional_debug_tools.cc +++ b/tensorflow/lite/micro/micro_optional_debug_tools.cc @@ -100,6 +100,7 @@ const char* AllocTypeName(TfLiteAllocationType type) { // Prints a dump of what tensors and what nodes are in the interpreter. 
void PrintInterpreterState(MicroInterpreter* interpreter) {
+#ifndef NDEBUG
   printf("Interpreter has %zu tensors and %zu nodes\n",
          interpreter->tensors_size(), interpreter->operators_size());
   printf("Inputs:");
@@ -137,6 +138,7 @@ void PrintInterpreterState(MicroInterpreter* interpreter) {
     printf(" Outputs:");
     PrintTfLiteIntVector(node.outputs);
   }
+#endif
 }

 }  // namespace tflite
diff --git a/tensorflow/lite/micro/tools/ci_build/test_arduino_library.sh b/tensorflow/lite/micro/tools/ci_build/test_arduino_library.sh
index ada030bb7c8..087b08d5e01 100755
--- a/tensorflow/lite/micro/tools/ci_build/test_arduino_library.sh
+++ b/tensorflow/lite/micro/tools/ci_build/test_arduino_library.sh
@@ -38,7 +38,7 @@ unzip -q ${LIBRARY_ZIP} -d ${ARDUINO_LIBRARIES_DIR}
 # Installs all dependencies for Arduino
 InstallLibraryDependencies () {
   # Required by magic_wand
-  ${ARDUINO_CLI_TOOL} lib install Arduino_LSM9DS1@1.0.0
+  ${ARDUINO_CLI_TOOL} lib install Arduino_LSM9DS1@1.1.0

   # Required by person_detection
   ${ARDUINO_CLI_TOOL} lib install JPEGDecoder@1.8.0
diff --git a/tensorflow/lite/micro/tools/make/Makefile b/tensorflow/lite/micro/tools/make/Makefile
index 8ce1974c437..1dc45f88cb9 100644
--- a/tensorflow/lite/micro/tools/make/Makefile
+++ b/tensorflow/lite/micro/tools/make/Makefile
@@ -68,10 +68,10 @@ MICROLITE_LIBS := -lm
 # There are no rules for compiling objects for the host system (since we don't
 # generate things like the protobuf compiler that require that), so all of
 # these settings are for the target compiler.
-CXXFLAGS := -O3 -DNDEBUG
+CXXFLAGS := -O3
 CXXFLAGS += -std=c++11 -g -DTF_LITE_STATIC_MEMORY
 CXXFLAGS += -fno-rtti
-CCFLAGS := -DNDEBUG -g -DTF_LITE_STATIC_MEMORY
+CCFLAGS := -g -DTF_LITE_STATIC_MEMORY
 LDOPTS := -L/usr/local/lib
 ARFLAGS := -r
 TARGET_TOOLCHAIN_PREFIX :=
diff --git a/tensorflow/lite/micro/tools/make/targets/bluepill_makefile.inc b/tensorflow/lite/micro/tools/make/targets/bluepill_makefile.inc
index 65155dfedb8..878067cf083 100644
--- a/tensorflow/lite/micro/tools/make/targets/bluepill_makefile.inc
+++ b/tensorflow/lite/micro/tools/make/targets/bluepill_makefile.inc
@@ -8,10 +8,13 @@ ifeq ($(TARGET), bluepill)
   $(eval $(call add_third_party_download,$(CMSIS_URL),$(CMSIS_MD5),cmsis,))
   $(eval $(call add_third_party_download,$(STM32_BARE_LIB_URL),$(STM32_BARE_LIB_MD5),stm32_bare_lib,))

+  # TODO(b/149943573): It may be worthwhile to remove -DNDEBUG if we can get the
+  # bluepill target to compile without it.
   PLATFORM_FLAGS = \
     -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \
     -DTF_LITE_STATIC_MEMORY \
     -DTF_LITE_MCU_DEBUG_LOG \
+    -DNDEBUG \
     -fno-rtti \
     -fmessage-length=0 \
     -fno-exceptions \
diff --git a/tensorflow/lite/micro/tools/make/targets/hexagon_makefile.inc b/tensorflow/lite/micro/tools/make/targets/hexagon_makefile.inc
index 906a7f3c10a..8775ccdbbbc 100644
--- a/tensorflow/lite/micro/tools/make/targets/hexagon_makefile.inc
+++ b/tensorflow/lite/micro/tools/make/targets/hexagon_makefile.inc
@@ -3,6 +3,8 @@
 # - Hexagon SDK 3.5 Toolkit (for hexagon-clang++, hexagon-sim).
 # - HEXAGON_SDK_PREFIX environment variable must be set to location of
 #   Hexagon_SDK/<version>/ on your machine.
+# - HEXAGON_CPU_VER: The CPU version to use; the build fails with a compiler
+#   error if no version is provided. Acceptable values: v55-v67
 #
 # Unlike other targets, there is not currently a way to automatically download
 # the Hexagon SDK. For this reason, users are required to manually download
@@ -42,8 +44,8 @@ ifeq ($(TARGET), hexagon)
     -fomit-frame-pointer \
     -fpermissive \
     -funsigned-char \
-    -mcpu=v66 \
-    -mv66
+    -mcpu=$(HEXAGON_CPU_VER) \
+    -m$(HEXAGON_CPU_VER)

   TARGET_TOOLCHAIN_PREFIX := hexagon-
   CXX_TOOL := clang++
diff --git a/tensorflow/lite/micro/tools/make/templates/library.properties b/tensorflow/lite/micro/tools/make/templates/library.properties
index 1ee869ae128..e41fd8d8fbe 100644
--- a/tensorflow/lite/micro/tools/make/templates/library.properties
+++ b/tensorflow/lite/micro/tools/make/templates/library.properties
@@ -1,5 +1,5 @@
 name=Arduino_TensorFlowLite
-version=1.15.0-ALPHA
+version=2.1.0-ALPHA
 author=TensorFlow Authors
 maintainer=Pete Warden
 sentence=Allows you to run machine learning models locally on your device.
diff --git a/tensorflow/lite/micro/xtensa-xpg/debug_log.cc b/tensorflow/lite/micro/xtensa-xpg/debug_log.cc
new file mode 100644
index 00000000000..a95a084978b
--- /dev/null
+++ b/tensorflow/lite/micro/xtensa-xpg/debug_log.cc
@@ -0,0 +1,45 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Reference implementation of the DebugLog() function that's required for a
+// platform to support the TensorFlow Lite for Microcontrollers library. This is
+// the only function that's absolutely required to be available on a target
+// device, since it's used for communicating test results back to the host so
+// that we can verify the implementation is working correctly.
+// It's designed to be as easy as possible to supply an implementation though.
+// On platforms that have a POSIX stack or C library, it can be written as a
+// single call to `fprintf(stderr, "%s", s)` to output a string to the error
+// stream of the console, but if there's no OS or C library available, there's
+// almost always an equivalent way to write out a string to some serial
+// interface that can be used instead. For example on Arm M-series MCUs, calling
+// the `bkpt #0xAB` assembler instruction will output the string in r1 to
+// whatever debug serial connection is available. If you're running mbed, you
+// can do the same by creating `Serial pc(USBTX, USBRX)` and then calling
+// `pc.printf("%s", s)`.
+// To add an equivalent function for your own platform, create your own
+// implementation file, and place it in a subfolder named after the OS you're
+// targeting. For example, see the Cortex M bare metal version in
+// tensorflow/lite/micro/bluepill/debug_log.cc or the mbed one in
+// tensorflow/lite/micro/mbed/debug_log.cc.
+
+#include "tensorflow/lite/micro/debug_log.h"
+
+#include <cstdio>
+
+extern "C" void DebugLog(const char* s) {
+#ifndef NDEBUG
+  fprintf(stderr, "%s", s);
+#endif
+}
diff --git a/tensorflow/lite/model.cc b/tensorflow/lite/model.cc
index 22a4cf21213..8f470713e1b 100644
--- a/tensorflow/lite/model.cc
+++ b/tensorflow/lite/model.cc
@@ -26,6 +26,7 @@ limitations under the License.
#include "tensorflow/lite/c/common.h" #include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/core/api/flatbuffer_conversions.h" +#include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/util.h" #include "tensorflow/lite/version.h" @@ -40,6 +41,45 @@ namespace { ErrorReporter* ValidateErrorReporter(ErrorReporter* e) { return e ? e : DefaultErrorReporter(); } + +template +void Copy(const T* data_ptr, TfLiteIntArray** arr) { + int size = data_ptr->values()->size(); + *arr = TfLiteIntArrayCreate(size); + for (int i = 0; i < size; i++) { + (*arr)->data[i] = static_cast(data_ptr->values()->Get(i)); + } +} + +void ParseSparseIndexVector(const DimensionMetadata* src, + TfLiteDimensionMetadata* tgt) { + switch (src->array_segments_type()) { + case SparseIndexVector_Int32Vector: + Copy(src->array_segments_as_Int32Vector(), &tgt->array_segments); + break; + case SparseIndexVector_Uint16Vector: + Copy(src->array_segments_as_Uint16Vector(), &tgt->array_segments); + break; + case SparseIndexVector_Uint8Vector: + Copy(src->array_segments_as_Uint8Vector(), &tgt->array_segments); + break; + default: + break; + } + switch (src->array_indices_type()) { + case SparseIndexVector_Int32Vector: + Copy(src->array_indices_as_Int32Vector(), &tgt->array_indices); + break; + case SparseIndexVector_Uint16Vector: + Copy(src->array_indices_as_Uint16Vector(), &tgt->array_indices); + break; + case SparseIndexVector_Uint8Vector: + Copy(src->array_indices_as_Uint8Vector(), &tgt->array_indices); + break; + default: + break; + } +} } // namespace const char* kEmptyTensorName = ""; @@ -422,8 +462,6 @@ TfLiteStatus InterpreterBuilder::ParseQuantization( return kTfLiteOk; } -// TODO(b/145614687): Add sparse tensor verification check in -// lite/tools/verifier.cc. TfLiteStatus InterpreterBuilder::ParseSparsity( const SparsityParameters* src_sparsity, TfLiteSparsity** sparsity_ptr) { if (!src_sparsity) { @@ -492,18 +530,7 @@ TfLiteStatus InterpreterBuilder::ParseSparsity( if (tgt_metadata->format == kTfLiteDimDense) { tgt_metadata->dense_size = src_metadata->dense_size(); } else { - const int array_segments_size = src_metadata->array_segments()->size(); - tgt_metadata->array_segments = TfLiteIntArrayCreate(array_segments_size); - for (int j = 0; j < array_segments_size; j++) { - tgt_metadata->array_segments->data[j] = - src_metadata->array_segments()->Get(j); - } - const int array_indices_size = src_metadata->array_indices()->size(); - tgt_metadata->array_indices = TfLiteIntArrayCreate(array_indices_size); - for (int j = 0; j < array_indices_size; j++) { - tgt_metadata->array_indices->data[j] = - src_metadata->array_indices()->Get(j); - } + ParseSparseIndexVector(src_metadata, tgt_metadata); } } diff --git a/tensorflow/lite/profiling/profile_summarizer.cc b/tensorflow/lite/profiling/profile_summarizer.cc index a4c763e4b28..acf630c93cf 100644 --- a/tensorflow/lite/profiling/profile_summarizer.cc +++ b/tensorflow/lite/profiling/profile_summarizer.cc @@ -89,8 +89,8 @@ OperatorDetails GetOperatorDetails(const tflite::Interpreter& interpreter, } // namespace ProfileSummarizer::ProfileSummarizer( - std::unique_ptr summary_formatter) - : summary_formatter_(std::move(summary_formatter)) { + std::shared_ptr summary_formatter) + : summary_formatter_(summary_formatter) { // Create stats calculator for the primary graph. 
   stats_calculator_map_[0] = std::unique_ptr<tensorflow::StatsCalculator>(
       new tensorflow::StatsCalculator(
diff --git a/tensorflow/lite/profiling/profile_summarizer.h b/tensorflow/lite/profiling/profile_summarizer.h
index 1348761b792..960c6ba7c3d 100644
--- a/tensorflow/lite/profiling/profile_summarizer.h
+++ b/tensorflow/lite/profiling/profile_summarizer.h
@@ -32,8 +32,8 @@ namespace profiling {
 class ProfileSummarizer {
  public:
   explicit ProfileSummarizer(
-      std::unique_ptr<ProfileSummaryFormatter> summary_formatter =
-          std::make_unique<ProfileSummaryDefaultFormatter>());
+      std::shared_ptr<ProfileSummaryFormatter> summary_formatter =
+          std::make_shared<ProfileSummaryDefaultFormatter>());
   virtual ~ProfileSummarizer() {}

   // Process profile events to update statistics for operator invocations.
@@ -70,7 +70,7 @@ class ProfileSummarizer {
   std::unique_ptr<tensorflow::StatsCalculator> delegate_stats_calculator_;

   // Summary formatter for customized output formats.
-  std::unique_ptr<ProfileSummaryFormatter> summary_formatter_;
+  std::shared_ptr<ProfileSummaryFormatter> summary_formatter_;
 };

 }  // namespace profiling
diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py
index 3965a4ac275..bda8898d879 100644
--- a/tensorflow/lite/python/lite.py
+++ b/tensorflow/lite/python/lite.py
@@ -46,6 +46,7 @@ from tensorflow.lite.python.convert_saved_model import freeze_saved_model as _fr
 from tensorflow.lite.python.interpreter import Interpreter  # pylint: disable=unused-import
 from tensorflow.lite.python.interpreter import load_delegate  # pylint: disable=unused-import
 from tensorflow.lite.python.op_hint import convert_op_hints_to_stubs  # pylint: disable=unused-import
+from tensorflow.lite.python.op_hint import is_ophint_converted as _is_ophint_converted
 from tensorflow.lite.python.op_hint import OpHint  # pylint: disable=unused-import
 from tensorflow.lite.python.optimize import calibrator as _calibrator
 from tensorflow.lite.python.util import build_debug_info_func as _build_debug_info_func
@@ -78,7 +79,7 @@ from tensorflow.python.util.tf_export import tf_export as _tf_export


 # The default value of `experimental_new_converter`.
-_USE_EXPERIMENTAL_NEW_CONVERTER = False
+_USE_EXPERIMENTAL_NEW_CONVERTER = True


 @_tf_export("lite.Optimize")
@@ -1028,10 +1029,16 @@ class TFLiteConverter(TFLiteConverterBase):
          (self.inference_type == constants.INT8 and
           (post_training_optimize or weight_only_quantize))):
       try:
+        # TODO(b/150163103): Merge `disabling lower using switch merge' calls.
+        # Grappler will also try to lower while loops into the switch/merge
+        # representation, which is undesired for OpHints, so we simply remove
+        # those attributes to prevent Grappler from doing so.
+        graph_def = _convert_to_constants.disable_lower_using_switch_merge(
+            optimized_graph)
         # Run function inlining optimization to ensure any models generated
         # through the from_frozen_graph path have been inlined.
         optimized_graph = _run_graph_optimizations(
-            self._graph_def,
+            graph_def,
             self._input_tensors,
             self._output_tensors,
             config=self._grappler_config(["function"]))
@@ -1128,6 +1135,10 @@ class TFLiteConverter(TFLiteConverterBase):
         tensor.set_shape(shape)

   def _is_unknown_shapes_allowed(self):
+    # OpHint-converted nodes need their shapes to be known.
+ if _is_ophint_converted(self._graph_def): + return False + if not super(TFLiteConverter, self)._is_unknown_shapes_allowed(): return False diff --git a/tensorflow/lite/python/op_hint.py b/tensorflow/lite/python/op_hint.py index 5aa212a573f..67b4d903a5b 100644 --- a/tensorflow/lite/python/op_hint.py +++ b/tensorflow/lite/python/op_hint.py @@ -79,7 +79,9 @@ import six as _six from tensorflow.core.framework import attr_value_pb2 as _attr_value_pb2 from tensorflow.core.framework import graph_pb2 as _graph_pb2 from tensorflow.core.framework import node_def_pb2 as _node_def_pb2 +from tensorflow.python.framework import dtypes as _dtypes from tensorflow.python.framework import ops as _ops +from tensorflow.python.framework import tensor_util as _tensor_util # TODO(aselle): publicize these apis if we continue to use these. from tensorflow.python.framework.graph_util_impl import _bfs_for_reachable_nodes from tensorflow.python.framework.graph_util_impl import _extract_graph_summary @@ -996,10 +998,26 @@ def _convert_single_op_hint_to_stub(call, # Delegate to each operand to produce the proper new input for this stub node. # In particular, an aggregate input will now be a Pack of some previously # non-fused things. - for input_index in sorted_input_indices: - inputs = call.inputs[input_index] - input_name = inputs.aggregate_and_return_name_for_input(out) - new_node.input.append(input_name) + + optional_input_node = _node_def_pb2.NodeDef() + optional_input_node.name = "Const" + str(_uuid.uuid1().hex) + optional_input_node.op = "Const" + optional_input_node.attr["dtype"].CopyFrom( + _attr_value_pb2.AttrValue(type=_dtypes.float32.as_datatype_enum)) + optional_input_node.attr["value"].CopyFrom( + _attr_value_pb2.AttrValue( + tensor=_tensor_util.make_tensor_proto([-1], _dtypes.float32, [1]))) + out.node.extend([optional_input_node]) + + max_index = max(sorted_input_indices) + 1 + for cur_index in range(max_index): + if cur_index in sorted_input_indices: + inputs = call.inputs[cur_index] + input_name = inputs.aggregate_and_return_name_for_input(out) + new_node.input.append(input_name) + else: + new_node.input.append(optional_input_node.name) + new_node.attr[OpHint.TFLITE_INPUT_INDICES].list.i.extend(sorted_input_indices) # Create the function @@ -1010,11 +1028,15 @@ def _convert_single_op_hint_to_stub(call, # Now call each output argument to give them a chance to make the proper # output type and add it to our new_node. output_dtypes = [] - for output_index in sorted_output_indices: - output = call.outputs[output_index] - output_dtype = ( - output.aggregate_and_return_name_for_output(new_node.name, output_index, - out)) + max_output_index = max(sorted_output_indices) + 1 + for cur_index in range(max_output_index): + if cur_index in sorted_output_indices: + output = call.outputs[cur_index] + output_dtype = ( + output.aggregate_and_return_name_for_output(new_node.name, cur_index, + out)) + else: + output_dtype = optional_input_node.attr["type"].i output_dtypes.append(output_dtype) new_node.attr["_output_types"].list.type[:] = output_dtypes # TODO(aselle): what is right here? 
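(Aside: the change above keeps the stub node's input and output indices dense: any index that was skipped is wired to a shared dummy Const node instead of being omitted. A minimal sketch of that padding idea; `names_by_index` and `dummy_name` are illustrative stand-ins, not the actual op_hint internals:

def pad_dense(sorted_indices, names_by_index, dummy_name):
    # Build a dense input list up to the largest used index; wire any
    # gap to the shared dummy constant so positions stay aligned.
    dense = []
    for cur_index in range(max(sorted_indices) + 1):
        if cur_index in sorted_indices:
            dense.append(names_by_index[cur_index])
        else:
            dense.append(dummy_name)
    return dense

# e.g. pad_dense([0, 2], {0: "in0", 2: "in2"}, "Const_dummy")
# -> ["in0", "Const_dummy", "in2"]
)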
@@ -1258,6 +1280,18 @@ def find_all_hinted_output_nodes(session=None, graph_def=None):
   return hinted_outputs_nodes


+def is_ophint_converted(graph_def):
+  if graph_def is None:
+    raise ValueError("Must provide the graph_def.")
+  ophint_converted = False
+  for node in graph_def.node:
+    attr = node.attr
+    if OpHint.FUNCTION_INPUT_INDEX_ATTR in attr:
+      ophint_converted = True
+      break
+  return ophint_converted
+
+
 @_tf_export(v1=["lite.experimental.convert_op_hints_to_stubs"])
 def convert_op_hints_to_stubs(session=None,
                               graph_def=None,
@@ -1291,7 +1325,10 @@ def convert_op_hints_to_stubs(session=None,


 _allowed_symbols = [
-    "OpHint", "convert_op_hints_to_stubs", "convert_op_hints_to_stubs_new",
-    "find_all_hinted_output_nodes"
+    "OpHint",
+    "convert_op_hints_to_stubs",
+    "convert_op_hints_to_stubs_new",
+    "find_all_hinted_output_nodes",
+    "is_ophint_converted",
 ]
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/lite/schema/schema.fbs b/tensorflow/lite/schema/schema.fbs
index e7d5eaed29f..0553e293f6a 100644
--- a/tensorflow/lite/schema/schema.fbs
+++ b/tensorflow/lite/schema/schema.fbs
@@ -108,6 +108,27 @@ enum DimensionType : byte {
   SPARSE_CSR = 1,
 }

+table Int32Vector {
+  values:[int];
+}
+
+table Uint16Vector {
+  values:[ushort] (force_align: 4);
+}
+
+table Uint8Vector {
+  values:[ubyte] (force_align: 4);
+}
+
+// Variable-typed buffer to store the index metadata for a sparse dimension.
+// The widest type is Int32 instead of UInt32 because a tensor's shape is an
+// int32 vector. We don't want the per-dimensional index to overflow that range.
+union SparseIndexVector {
+  Int32Vector,
+  Uint16Vector,
+  Uint8Vector
+}
+
 table DimensionMetadata {
   // Whether a dimension is dense or sparse.
   format:DimensionType;
@@ -123,8 +144,8 @@ table DimensionMetadata {
   // format, where the first array is row pointers and the second array is
   // column indices).
   dense_size:int;
-  array_segments:[int];
-  array_indices:[int];
+  array_segments:SparseIndexVector;
+  array_indices:SparseIndexVector;
 }

 // Parameters to encode a sparse TfLite tensor.
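(Aside: the point of the union is to let a converter store sparse index metadata in the narrowest integer type that fits, since this metadata can dominate the size of a sparse model. A sketch of the selection rule implied by the three variants; this is illustrative, not the actual converter code:

def pick_sparse_index_vector(max_index):
    # Choose the narrowest SparseIndexVector variant that can hold the
    # largest index value for this dimension.
    if max_index <= 0xFF:
        return "Uint8Vector"
    if max_index <= 0xFFFF:
        return "Uint16Vector"
    return "Int32Vector"  # shape is int32, so indices never exceed this
)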
diff --git a/tensorflow/lite/schema/schema_generated.h b/tensorflow/lite/schema/schema_generated.h
index b91a2f0343d..282433d7ccc 100755
--- a/tensorflow/lite/schema/schema_generated.h
+++ b/tensorflow/lite/schema/schema_generated.h
@@ -28,6 +28,15 @@ struct CustomQuantizationT;
 struct QuantizationParameters;
 struct QuantizationParametersT;

+struct Int32Vector;
+struct Int32VectorT;
+
+struct Uint16Vector;
+struct Uint16VectorT;
+
+struct Uint8Vector;
+struct Uint8VectorT;
+
 struct DimensionMetadata;
 struct DimensionMetadataT;

@@ -522,6 +531,119 @@ inline const char *EnumNameDimensionType(DimensionType e) {
   return EnumNamesDimensionType()[index];
 }

+enum SparseIndexVector {
+  SparseIndexVector_NONE = 0,
+  SparseIndexVector_Int32Vector = 1,
+  SparseIndexVector_Uint16Vector = 2,
+  SparseIndexVector_Uint8Vector = 3,
+  SparseIndexVector_MIN = SparseIndexVector_NONE,
+  SparseIndexVector_MAX = SparseIndexVector_Uint8Vector
+};
+
+inline const SparseIndexVector (&EnumValuesSparseIndexVector())[4] {
+  static const SparseIndexVector values[] = {
+    SparseIndexVector_NONE,
+    SparseIndexVector_Int32Vector,
+    SparseIndexVector_Uint16Vector,
+    SparseIndexVector_Uint8Vector
+  };
+  return values;
+}
+
+inline const char * const *EnumNamesSparseIndexVector() {
+  static const char * const names[] = {
+    "NONE",
+    "Int32Vector",
+    "Uint16Vector",
+    "Uint8Vector",
+    nullptr
+  };
+  return names;
+}
+
+inline const char *EnumNameSparseIndexVector(SparseIndexVector e) {
+  if (e < SparseIndexVector_NONE || e > SparseIndexVector_Uint8Vector) return "";
+  const size_t index = static_cast<size_t>(e);
+  return EnumNamesSparseIndexVector()[index];
+}
+
+template<typename T> struct SparseIndexVectorTraits {
+  static const SparseIndexVector enum_value = SparseIndexVector_NONE;
+};
+
+template<> struct SparseIndexVectorTraits<tflite::Int32VectorT> {
+  static const SparseIndexVector enum_value = SparseIndexVector_Int32Vector;
+};
+
+template<> struct SparseIndexVectorTraits<tflite::Uint16VectorT> {
+  static const SparseIndexVector enum_value = SparseIndexVector_Uint16Vector;
+};
+
+template<> struct SparseIndexVectorTraits<tflite::Uint8VectorT> {
+  static const SparseIndexVector enum_value = SparseIndexVector_Uint8Vector;
+};
+
+struct SparseIndexVectorUnion {
+  SparseIndexVector type;
+  void *value;
+
+  SparseIndexVectorUnion() : type(SparseIndexVector_NONE), value(nullptr) {}
+  SparseIndexVectorUnion(SparseIndexVectorUnion&& u) FLATBUFFERS_NOEXCEPT :
+    type(SparseIndexVector_NONE), value(nullptr)
+    { std::swap(type, u.type); std::swap(value, u.value); }
+  SparseIndexVectorUnion(const SparseIndexVectorUnion &) FLATBUFFERS_NOEXCEPT;
+  SparseIndexVectorUnion &operator=(const SparseIndexVectorUnion &u) FLATBUFFERS_NOEXCEPT
+    { SparseIndexVectorUnion t(u); std::swap(type, t.type); std::swap(value, t.value); return *this; }
+  SparseIndexVectorUnion &operator=(SparseIndexVectorUnion &&u) FLATBUFFERS_NOEXCEPT
+    { std::swap(type, u.type); std::swap(value, u.value); return *this; }
+  ~SparseIndexVectorUnion() { Reset(); }
+
+  void Reset();
+
+#ifndef FLATBUFFERS_CPP98_STL
+  template <typename T>
+  void Set(T&& val) {
+    using RT = typename std::remove_reference<T>::type;
+    Reset();
+    type = SparseIndexVectorTraits<typename RT::TableType>::enum_value;
+    if (type != SparseIndexVector_NONE) {
+      value = new RT(std::forward<T>(val));
+    }
+  }
+#endif  // FLATBUFFERS_CPP98_STL
+
+  static void *UnPack(const void *obj, SparseIndexVector type, const flatbuffers::resolver_function_t *resolver);
+  flatbuffers::Offset<void> Pack(flatbuffers::FlatBufferBuilder &_fbb, const flatbuffers::rehasher_function_t *_rehasher = nullptr) const;
+
+  Int32VectorT *AsInt32Vector() {
+    return type == SparseIndexVector_Int32Vector ?
+      reinterpret_cast<Int32VectorT *>(value) : nullptr;
+  }
+  const Int32VectorT *AsInt32Vector() const {
+    return type == SparseIndexVector_Int32Vector ?
+      reinterpret_cast<const Int32VectorT *>(value) : nullptr;
+  }
+  Uint16VectorT *AsUint16Vector() {
+    return type == SparseIndexVector_Uint16Vector ?
+      reinterpret_cast<Uint16VectorT *>(value) : nullptr;
+  }
+  const Uint16VectorT *AsUint16Vector() const {
+    return type == SparseIndexVector_Uint16Vector ?
+      reinterpret_cast<const Uint16VectorT *>(value) : nullptr;
+  }
+  Uint8VectorT *AsUint8Vector() {
+    return type == SparseIndexVector_Uint8Vector ?
+      reinterpret_cast<Uint8VectorT *>(value) : nullptr;
+  }
+  const Uint8VectorT *AsUint8Vector() const {
+    return type == SparseIndexVector_Uint8Vector ?
+      reinterpret_cast<const Uint8VectorT *>(value) : nullptr;
+  }
+};
+
+bool VerifySparseIndexVector(flatbuffers::Verifier &verifier, const void *obj, SparseIndexVector type);
+bool VerifySparseIndexVectorVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector<flatbuffers::Offset<void>> *values, const flatbuffers::Vector<uint8_t> *types);
+
 enum BuiltinOperator {
   BuiltinOperator_ADD = 0,
   BuiltinOperator_AVERAGE_POOL_2D = 1,
@@ -2802,6 +2924,7 @@ inline flatbuffers::Offset<CustomQuantization> CreateCustomQuantization(
 inline flatbuffers::Offset<CustomQuantization> CreateCustomQuantizationDirect(
     flatbuffers::FlatBufferBuilder &_fbb,
     const std::vector<uint8_t> *custom = nullptr) {
+  if (custom) { _fbb.ForceVectorAlignment(custom->size(), sizeof(uint8_t), 16); }
   auto custom__ = custom ? _fbb.CreateVector(*custom) : 0;
   return tflite::CreateCustomQuantization(
       _fbb,
@@ -2966,12 +3089,201 @@ inline flatbuffers::Offset<QuantizationParameters> CreateQuantizationParametersD

 flatbuffers::Offset<QuantizationParameters> CreateQuantizationParameters(flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);

+struct Int32VectorT : public flatbuffers::NativeTable {
+  typedef Int32Vector TableType;
+  std::vector<int32_t> values;
+  Int32VectorT() {
+  }
+};
+
+struct Int32Vector FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef Int32VectorT NativeTableType;
+  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+    VT_VALUES = 4
+  };
+  const flatbuffers::Vector<int32_t> *values() const {
+    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_VALUES);
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyOffset(verifier, VT_VALUES) &&
+           verifier.VerifyVector(values()) &&
+           verifier.EndTable();
+  }
+  Int32VectorT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(Int32VectorT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<Int32Vector> Pack(flatbuffers::FlatBufferBuilder &_fbb, const Int32VectorT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct Int32VectorBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_values(flatbuffers::Offset<flatbuffers::Vector<int32_t>> values) {
+    fbb_.AddOffset(Int32Vector::VT_VALUES, values);
+  }
+  explicit Int32VectorBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  Int32VectorBuilder &operator=(const Int32VectorBuilder &);
+  flatbuffers::Offset<Int32Vector> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<Int32Vector>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<Int32Vector> CreateInt32Vector(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    flatbuffers::Offset<flatbuffers::Vector<int32_t>> values = 0) {
+  Int32VectorBuilder builder_(_fbb);
+  builder_.add_values(values);
+  return builder_.Finish();
+}
+
+inline flatbuffers::Offset<Int32Vector> CreateInt32VectorDirect(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const std::vector<int32_t> *values = nullptr) {
+  auto values__ = values ? _fbb.CreateVector<int32_t>(*values) : 0;
+  return tflite::CreateInt32Vector(
+      _fbb,
+      values__);
+}
+
+flatbuffers::Offset<Int32Vector> CreateInt32Vector(flatbuffers::FlatBufferBuilder &_fbb, const Int32VectorT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct Uint16VectorT : public flatbuffers::NativeTable {
+  typedef Uint16Vector TableType;
+  std::vector<uint16_t> values;
+  Uint16VectorT() {
+  }
+};
+
+struct Uint16Vector FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef Uint16VectorT NativeTableType;
+  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+    VT_VALUES = 4
+  };
+  const flatbuffers::Vector<uint16_t> *values() const {
+    return GetPointer<const flatbuffers::Vector<uint16_t> *>(VT_VALUES);
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyOffset(verifier, VT_VALUES) &&
+           verifier.VerifyVector(values()) &&
+           verifier.EndTable();
+  }
+  Uint16VectorT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(Uint16VectorT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<Uint16Vector> Pack(flatbuffers::FlatBufferBuilder &_fbb, const Uint16VectorT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct Uint16VectorBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_values(flatbuffers::Offset<flatbuffers::Vector<uint16_t>> values) {
+    fbb_.AddOffset(Uint16Vector::VT_VALUES, values);
+  }
+  explicit Uint16VectorBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  Uint16VectorBuilder &operator=(const Uint16VectorBuilder &);
+  flatbuffers::Offset<Uint16Vector> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<Uint16Vector>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<Uint16Vector> CreateUint16Vector(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    flatbuffers::Offset<flatbuffers::Vector<uint16_t>> values = 0) {
+  Uint16VectorBuilder builder_(_fbb);
+  builder_.add_values(values);
+  return builder_.Finish();
+}
+
+inline flatbuffers::Offset<Uint16Vector> CreateUint16VectorDirect(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const std::vector<uint16_t> *values = nullptr) {
+  auto values__ = values ? 
_fbb.CreateVector(*values) : 0; + return tflite::CreateUint16Vector( + _fbb, + values__); +} + +flatbuffers::Offset CreateUint16Vector(flatbuffers::FlatBufferBuilder &_fbb, const Uint16VectorT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct Uint8VectorT : public flatbuffers::NativeTable { + typedef Uint8Vector TableType; + std::vector values; + Uint8VectorT() { + } +}; + +struct Uint8Vector FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef Uint8VectorT NativeTableType; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_VALUES = 4 + }; + const flatbuffers::Vector *values() const { + return GetPointer *>(VT_VALUES); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_VALUES) && + verifier.VerifyVector(values()) && + verifier.EndTable(); + } + Uint8VectorT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(Uint8VectorT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const; + static flatbuffers::Offset Pack(flatbuffers::FlatBufferBuilder &_fbb, const Uint8VectorT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct Uint8VectorBuilder { + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_values(flatbuffers::Offset> values) { + fbb_.AddOffset(Uint8Vector::VT_VALUES, values); + } + explicit Uint8VectorBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + Uint8VectorBuilder &operator=(const Uint8VectorBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateUint8Vector( + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset> values = 0) { + Uint8VectorBuilder builder_(_fbb); + builder_.add_values(values); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateUint8VectorDirect( + flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *values = nullptr) { + auto values__ = values ? 
_fbb.CreateVector(*values) : 0; + return tflite::CreateUint8Vector( + _fbb, + values__); +} + +flatbuffers::Offset CreateUint8Vector(flatbuffers::FlatBufferBuilder &_fbb, const Uint8VectorT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); + struct DimensionMetadataT : public flatbuffers::NativeTable { typedef DimensionMetadata TableType; DimensionType format; int32_t dense_size; - std::vector array_segments; - std::vector array_indices; + SparseIndexVectorUnion array_segments; + SparseIndexVectorUnion array_indices; DimensionMetadataT() : format(DimensionType_DENSE), dense_size(0) { @@ -2983,8 +3295,10 @@ struct DimensionMetadata FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { VT_FORMAT = 4, VT_DENSE_SIZE = 6, - VT_ARRAY_SEGMENTS = 8, - VT_ARRAY_INDICES = 10 + VT_ARRAY_SEGMENTS_TYPE = 8, + VT_ARRAY_SEGMENTS = 10, + VT_ARRAY_INDICES_TYPE = 12, + VT_ARRAY_INDICES = 14 }; DimensionType format() const { return static_cast(GetField(VT_FORMAT, 0)); @@ -2992,20 +3306,48 @@ struct DimensionMetadata FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { int32_t dense_size() const { return GetField(VT_DENSE_SIZE, 0); } - const flatbuffers::Vector *array_segments() const { - return GetPointer *>(VT_ARRAY_SEGMENTS); + SparseIndexVector array_segments_type() const { + return static_cast(GetField(VT_ARRAY_SEGMENTS_TYPE, 0)); } - const flatbuffers::Vector *array_indices() const { - return GetPointer *>(VT_ARRAY_INDICES); + const void *array_segments() const { + return GetPointer(VT_ARRAY_SEGMENTS); + } + template const T *array_segments_as() const; + const Int32Vector *array_segments_as_Int32Vector() const { + return array_segments_type() == SparseIndexVector_Int32Vector ? static_cast(array_segments()) : nullptr; + } + const Uint16Vector *array_segments_as_Uint16Vector() const { + return array_segments_type() == SparseIndexVector_Uint16Vector ? static_cast(array_segments()) : nullptr; + } + const Uint8Vector *array_segments_as_Uint8Vector() const { + return array_segments_type() == SparseIndexVector_Uint8Vector ? static_cast(array_segments()) : nullptr; + } + SparseIndexVector array_indices_type() const { + return static_cast(GetField(VT_ARRAY_INDICES_TYPE, 0)); + } + const void *array_indices() const { + return GetPointer(VT_ARRAY_INDICES); + } + template const T *array_indices_as() const; + const Int32Vector *array_indices_as_Int32Vector() const { + return array_indices_type() == SparseIndexVector_Int32Vector ? static_cast(array_indices()) : nullptr; + } + const Uint16Vector *array_indices_as_Uint16Vector() const { + return array_indices_type() == SparseIndexVector_Uint16Vector ? static_cast(array_indices()) : nullptr; + } + const Uint8Vector *array_indices_as_Uint8Vector() const { + return array_indices_type() == SparseIndexVector_Uint8Vector ? 
static_cast(array_indices()) : nullptr; } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && VerifyField(verifier, VT_FORMAT) && VerifyField(verifier, VT_DENSE_SIZE) && + VerifyField(verifier, VT_ARRAY_SEGMENTS_TYPE) && VerifyOffset(verifier, VT_ARRAY_SEGMENTS) && - verifier.VerifyVector(array_segments()) && + VerifySparseIndexVector(verifier, array_segments(), array_segments_type()) && + VerifyField(verifier, VT_ARRAY_INDICES_TYPE) && VerifyOffset(verifier, VT_ARRAY_INDICES) && - verifier.VerifyVector(array_indices()) && + VerifySparseIndexVector(verifier, array_indices(), array_indices_type()) && verifier.EndTable(); } DimensionMetadataT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; @@ -3013,6 +3355,30 @@ struct DimensionMetadata FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { static flatbuffers::Offset Pack(flatbuffers::FlatBufferBuilder &_fbb, const DimensionMetadataT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); }; +template<> inline const Int32Vector *DimensionMetadata::array_segments_as() const { + return array_segments_as_Int32Vector(); +} + +template<> inline const Uint16Vector *DimensionMetadata::array_segments_as() const { + return array_segments_as_Uint16Vector(); +} + +template<> inline const Uint8Vector *DimensionMetadata::array_segments_as() const { + return array_segments_as_Uint8Vector(); +} + +template<> inline const Int32Vector *DimensionMetadata::array_indices_as() const { + return array_indices_as_Int32Vector(); +} + +template<> inline const Uint16Vector *DimensionMetadata::array_indices_as() const { + return array_indices_as_Uint16Vector(); +} + +template<> inline const Uint8Vector *DimensionMetadata::array_indices_as() const { + return array_indices_as_Uint8Vector(); +} + struct DimensionMetadataBuilder { flatbuffers::FlatBufferBuilder &fbb_; flatbuffers::uoffset_t start_; @@ -3022,10 +3388,16 @@ struct DimensionMetadataBuilder { void add_dense_size(int32_t dense_size) { fbb_.AddElement(DimensionMetadata::VT_DENSE_SIZE, dense_size, 0); } - void add_array_segments(flatbuffers::Offset> array_segments) { + void add_array_segments_type(SparseIndexVector array_segments_type) { + fbb_.AddElement(DimensionMetadata::VT_ARRAY_SEGMENTS_TYPE, static_cast(array_segments_type), 0); + } + void add_array_segments(flatbuffers::Offset array_segments) { fbb_.AddOffset(DimensionMetadata::VT_ARRAY_SEGMENTS, array_segments); } - void add_array_indices(flatbuffers::Offset> array_indices) { + void add_array_indices_type(SparseIndexVector array_indices_type) { + fbb_.AddElement(DimensionMetadata::VT_ARRAY_INDICES_TYPE, static_cast(array_indices_type), 0); + } + void add_array_indices(flatbuffers::Offset array_indices) { fbb_.AddOffset(DimensionMetadata::VT_ARRAY_INDICES, array_indices); } explicit DimensionMetadataBuilder(flatbuffers::FlatBufferBuilder &_fbb) @@ -3044,32 +3416,20 @@ inline flatbuffers::Offset CreateDimensionMetadata( flatbuffers::FlatBufferBuilder &_fbb, DimensionType format = DimensionType_DENSE, int32_t dense_size = 0, - flatbuffers::Offset> array_segments = 0, - flatbuffers::Offset> array_indices = 0) { + SparseIndexVector array_segments_type = SparseIndexVector_NONE, + flatbuffers::Offset array_segments = 0, + SparseIndexVector array_indices_type = SparseIndexVector_NONE, + flatbuffers::Offset array_indices = 0) { DimensionMetadataBuilder builder_(_fbb); builder_.add_array_indices(array_indices); builder_.add_array_segments(array_segments); 
builder_.add_dense_size(dense_size); + builder_.add_array_indices_type(array_indices_type); + builder_.add_array_segments_type(array_segments_type); builder_.add_format(format); return builder_.Finish(); } -inline flatbuffers::Offset CreateDimensionMetadataDirect( - flatbuffers::FlatBufferBuilder &_fbb, - DimensionType format = DimensionType_DENSE, - int32_t dense_size = 0, - const std::vector *array_segments = nullptr, - const std::vector *array_indices = nullptr) { - auto array_segments__ = array_segments ? _fbb.CreateVector(*array_segments) : 0; - auto array_indices__ = array_indices ? _fbb.CreateVector(*array_indices) : 0; - return tflite::CreateDimensionMetadata( - _fbb, - format, - dense_size, - array_segments__, - array_indices__); -} - flatbuffers::Offset CreateDimensionMetadata(flatbuffers::FlatBufferBuilder &_fbb, const DimensionMetadataT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); struct SparsityParametersT : public flatbuffers::NativeTable { @@ -9896,6 +10256,7 @@ inline flatbuffers::Offset CreateBuffer( inline flatbuffers::Offset CreateBufferDirect( flatbuffers::FlatBufferBuilder &_fbb, const std::vector *data = nullptr) { + if (data) { _fbb.ForceVectorAlignment(data->size(), sizeof(uint8_t), 16); } auto data__ = data ? _fbb.CreateVector(*data) : 0; return tflite::CreateBuffer( _fbb, @@ -10157,6 +10518,7 @@ inline flatbuffers::Offset CreateCustomQuantization(flatbuff (void)_rehasher; (void)_o; struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const CustomQuantizationT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + _fbb.ForceVectorAlignment(_o->custom.size(), sizeof(uint8_t), 16); auto _custom = _o->custom.size() ? _fbb.CreateVector(_o->custom) : 0; return tflite::CreateCustomQuantization( _fbb, @@ -10207,6 +10569,84 @@ inline flatbuffers::Offset CreateQuantizationParameters( _quantized_dimension); } +inline Int32VectorT *Int32Vector::UnPack(const flatbuffers::resolver_function_t *_resolver) const { + auto _o = new Int32VectorT(); + UnPackTo(_o, _resolver); + return _o; +} + +inline void Int32Vector::UnPackTo(Int32VectorT *_o, const flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = values(); if (_e) { _o->values.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->values[_i] = _e->Get(_i); } } }; +} + +inline flatbuffers::Offset Int32Vector::Pack(flatbuffers::FlatBufferBuilder &_fbb, const Int32VectorT* _o, const flatbuffers::rehasher_function_t *_rehasher) { + return CreateInt32Vector(_fbb, _o, _rehasher); +} + +inline flatbuffers::Offset CreateInt32Vector(flatbuffers::FlatBufferBuilder &_fbb, const Int32VectorT *_o, const flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const Int32VectorT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _values = _o->values.size() ? 
_fbb.CreateVector(_o->values) : 0; + return tflite::CreateInt32Vector( + _fbb, + _values); +} + +inline Uint16VectorT *Uint16Vector::UnPack(const flatbuffers::resolver_function_t *_resolver) const { + auto _o = new Uint16VectorT(); + UnPackTo(_o, _resolver); + return _o; +} + +inline void Uint16Vector::UnPackTo(Uint16VectorT *_o, const flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = values(); if (_e) { _o->values.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->values[_i] = _e->Get(_i); } } }; +} + +inline flatbuffers::Offset Uint16Vector::Pack(flatbuffers::FlatBufferBuilder &_fbb, const Uint16VectorT* _o, const flatbuffers::rehasher_function_t *_rehasher) { + return CreateUint16Vector(_fbb, _o, _rehasher); +} + +inline flatbuffers::Offset CreateUint16Vector(flatbuffers::FlatBufferBuilder &_fbb, const Uint16VectorT *_o, const flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const Uint16VectorT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _values = _o->values.size() ? _fbb.CreateVector(_o->values) : 0; + return tflite::CreateUint16Vector( + _fbb, + _values); +} + +inline Uint8VectorT *Uint8Vector::UnPack(const flatbuffers::resolver_function_t *_resolver) const { + auto _o = new Uint8VectorT(); + UnPackTo(_o, _resolver); + return _o; +} + +inline void Uint8Vector::UnPackTo(Uint8VectorT *_o, const flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = values(); if (_e) { _o->values.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->values[_i] = _e->Get(_i); } } }; +} + +inline flatbuffers::Offset Uint8Vector::Pack(flatbuffers::FlatBufferBuilder &_fbb, const Uint8VectorT* _o, const flatbuffers::rehasher_function_t *_rehasher) { + return CreateUint8Vector(_fbb, _o, _rehasher); +} + +inline flatbuffers::Offset CreateUint8Vector(flatbuffers::FlatBufferBuilder &_fbb, const Uint8VectorT *_o, const flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const Uint8VectorT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _values = _o->values.size() ? 
_fbb.CreateVector(_o->values) : 0; + return tflite::CreateUint8Vector( + _fbb, + _values); +} + inline DimensionMetadataT *DimensionMetadata::UnPack(const flatbuffers::resolver_function_t *_resolver) const { auto _o = new DimensionMetadataT(); UnPackTo(_o, _resolver); @@ -10218,8 +10658,10 @@ inline void DimensionMetadata::UnPackTo(DimensionMetadataT *_o, const flatbuffer (void)_resolver; { auto _e = format(); _o->format = _e; }; { auto _e = dense_size(); _o->dense_size = _e; }; - { auto _e = array_segments(); if (_e) { _o->array_segments.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->array_segments[_i] = _e->Get(_i); } } }; - { auto _e = array_indices(); if (_e) { _o->array_indices.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->array_indices[_i] = _e->Get(_i); } } }; + { auto _e = array_segments_type(); _o->array_segments.type = _e; }; + { auto _e = array_segments(); if (_e) _o->array_segments.value = SparseIndexVectorUnion::UnPack(_e, array_segments_type(), _resolver); }; + { auto _e = array_indices_type(); _o->array_indices.type = _e; }; + { auto _e = array_indices(); if (_e) _o->array_indices.value = SparseIndexVectorUnion::UnPack(_e, array_indices_type(), _resolver); }; } inline flatbuffers::Offset DimensionMetadata::Pack(flatbuffers::FlatBufferBuilder &_fbb, const DimensionMetadataT* _o, const flatbuffers::rehasher_function_t *_rehasher) { @@ -10232,13 +10674,17 @@ inline flatbuffers::Offset CreateDimensionMetadata(flatbuffer struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const DimensionMetadataT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; auto _format = _o->format; auto _dense_size = _o->dense_size; - auto _array_segments = _o->array_segments.size() ? _fbb.CreateVector(_o->array_segments) : 0; - auto _array_indices = _o->array_indices.size() ? _fbb.CreateVector(_o->array_indices) : 0; + auto _array_segments_type = _o->array_segments.type; + auto _array_segments = _o->array_segments.Pack(_fbb); + auto _array_indices_type = _o->array_indices.type; + auto _array_indices = _o->array_indices.Pack(_fbb); return tflite::CreateDimensionMetadata( _fbb, _format, _dense_size, + _array_segments_type, _array_segments, + _array_indices_type, _array_indices); } @@ -13082,6 +13528,7 @@ inline flatbuffers::Offset CreateBuffer(flatbuffers::FlatBufferBuilder & (void)_rehasher; (void)_o; struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const BufferT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + _fbb.ForceVectorAlignment(_o->data.size(), sizeof(uint8_t), 16); auto _data = _o->data.size() ? 
_fbb.CreateVector(_o->data) : 0;
   return tflite::CreateBuffer(
       _fbb,
@@ -13230,6 +13677,117 @@ inline void QuantizationDetailsUnion::Reset() {
   type = QuantizationDetails_NONE;
 }

+inline bool VerifySparseIndexVector(flatbuffers::Verifier &verifier, const void *obj, SparseIndexVector type) {
+  switch (type) {
+    case SparseIndexVector_NONE: {
+      return true;
+    }
+    case SparseIndexVector_Int32Vector: {
+      auto ptr = reinterpret_cast<const Int32Vector *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case SparseIndexVector_Uint16Vector: {
+      auto ptr = reinterpret_cast<const Uint16Vector *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case SparseIndexVector_Uint8Vector: {
+      auto ptr = reinterpret_cast<const Uint8Vector *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    default: return true;
+  }
+}
+
+inline bool VerifySparseIndexVectorVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector<flatbuffers::Offset<void>> *values, const flatbuffers::Vector<uint8_t> *types) {
+  if (!values || !types) return !values && !types;
+  if (values->size() != types->size()) return false;
+  for (flatbuffers::uoffset_t i = 0; i < values->size(); ++i) {
+    if (!VerifySparseIndexVector(
+        verifier,  values->Get(i), types->GetEnum<SparseIndexVector>(i))) {
+      return false;
+    }
+  }
+  return true;
+}
+
+inline void *SparseIndexVectorUnion::UnPack(const void *obj, SparseIndexVector type, const flatbuffers::resolver_function_t *resolver) {
+  switch (type) {
+    case SparseIndexVector_Int32Vector: {
+      auto ptr = reinterpret_cast<const Int32Vector *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case SparseIndexVector_Uint16Vector: {
+      auto ptr = reinterpret_cast<const Uint16Vector *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case SparseIndexVector_Uint8Vector: {
+      auto ptr = reinterpret_cast<const Uint8Vector *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    default: return nullptr;
+  }
+}
+
+inline flatbuffers::Offset<void> SparseIndexVectorUnion::Pack(flatbuffers::FlatBufferBuilder &_fbb, const flatbuffers::rehasher_function_t *_rehasher) const {
+  switch (type) {
+    case SparseIndexVector_Int32Vector: {
+      auto ptr = reinterpret_cast<const Int32VectorT *>(value);
+      return CreateInt32Vector(_fbb, ptr, _rehasher).Union();
+    }
+    case SparseIndexVector_Uint16Vector: {
+      auto ptr = reinterpret_cast<const Uint16VectorT *>(value);
+      return CreateUint16Vector(_fbb, ptr, _rehasher).Union();
+    }
+    case SparseIndexVector_Uint8Vector: {
+      auto ptr = reinterpret_cast<const Uint8VectorT *>(value);
+      return CreateUint8Vector(_fbb, ptr, _rehasher).Union();
+    }
+    default: return 0;
+  }
+}
+
+inline SparseIndexVectorUnion::SparseIndexVectorUnion(const SparseIndexVectorUnion &u) FLATBUFFERS_NOEXCEPT : type(u.type), value(nullptr) {
+  switch (type) {
+    case SparseIndexVector_Int32Vector: {
+      value = new Int32VectorT(*reinterpret_cast<Int32VectorT *>(u.value));
+      break;
+    }
+    case SparseIndexVector_Uint16Vector: {
+      value = new Uint16VectorT(*reinterpret_cast<Uint16VectorT *>(u.value));
+      break;
+    }
+    case SparseIndexVector_Uint8Vector: {
+      value = new Uint8VectorT(*reinterpret_cast<Uint8VectorT *>(u.value));
+      break;
+    }
+    default:
+      break;
+  }
+}
+
+inline void SparseIndexVectorUnion::Reset() {
+  switch (type) {
+    case SparseIndexVector_Int32Vector: {
+      auto ptr = reinterpret_cast<Int32VectorT *>(value);
+      delete ptr;
+      break;
+    }
+    case SparseIndexVector_Uint16Vector: {
+      auto ptr = reinterpret_cast<Uint16VectorT *>(value);
+      delete ptr;
+      break;
+    }
+    case SparseIndexVector_Uint8Vector: {
+      auto ptr = reinterpret_cast<Uint8VectorT *>(value);
+      delete ptr;
+      break;
+    }
+    default: break;
+  }
+  value = nullptr;
+  type = SparseIndexVector_NONE;
+}
+
 inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type) {
   switch (type) {
     case BuiltinOptions_NONE: {
diff --git a/tensorflow/lite/special_rules.bzl b/tensorflow/lite/special_rules.bzl
index
0ccff131b5e..b0ece0e2d25 100644 --- a/tensorflow/lite/special_rules.bzl +++ b/tensorflow/lite/special_rules.bzl @@ -22,3 +22,7 @@ def ios_visibility_whitelist(): def tflite_extra_gles_deps(): """This is a no-op outside of Google.""" return [] + +def tflite_ios_lab_runner(version): + """This is a no-op outside of Google.""" + return None diff --git a/tensorflow/lite/testdata/sparse_tensor.bin b/tensorflow/lite/testdata/sparse_tensor.bin index 497ce68a3ac..c035e02441d 100644 Binary files a/tensorflow/lite/testdata/sparse_tensor.bin and b/tensorflow/lite/testdata/sparse_tensor.bin differ diff --git a/tensorflow/lite/testdata/sparse_tensor.json b/tensorflow/lite/testdata/sparse_tensor.json index ce627e2bb2d..d23c0d0a64b 100644 --- a/tensorflow/lite/testdata/sparse_tensor.json +++ b/tensorflow/lite/testdata/sparse_tensor.json @@ -25,8 +25,10 @@ }, { "format": "SPARSE_CSR", - "array_segments": [0, 2, 3], - "array_indices": [0, 1, 1] + "array_segments_type": "Uint8Vector", + "array_segments": {"values": [0, 2, 3]}, + "array_indices_type": "Uint8Vector", + "array_indices": {"values": [0, 1, 1]} }, { "format": "DENSE", diff --git a/tensorflow/lite/testing/op_tests/tile.py b/tensorflow/lite/testing/op_tests/tile.py index f486e059228..49d838c54ec 100644 --- a/tensorflow/lite/testing/op_tests/tile.py +++ b/tensorflow/lite/testing/op_tests/tile.py @@ -27,7 +27,7 @@ from tensorflow.lite.testing.zip_test_utils import register_make_test_function def make_tile_tests(options): """Make a set of tests to do tile.""" test_parameters = [{ - "input_dtype": [tf.float32, tf.int32, tf.bool], + "input_dtype": [tf.float32, tf.int32, tf.bool, tf.string], "input_shape": [[3, 2, 1], [2, 2, 2]], "multiplier_dtype": [tf.int32, tf.int64], "multiplier_shape": [[3]] diff --git a/tensorflow/lite/toco/import_tensorflow.cc b/tensorflow/lite/toco/import_tensorflow.cc index d69c787652e..293fc654084 100644 --- a/tensorflow/lite/toco/import_tensorflow.cc +++ b/tensorflow/lite/toco/import_tensorflow.cc @@ -2410,9 +2410,6 @@ tensorflow::Status ConvertUnidirectionalSequenceLstm( DCHECK_EQ(node.op(), "UnidirectionalSequenceLstm"); const auto& indices = GetListAttr(node, "_tflite_input_indices"); - if (indices.i_size() != node.input().size()) { - return tensorflow::errors::InvalidArgument("Input size does not match."); - } auto* op = new UnidirectionalSequenceLstmOperator(); @@ -2421,20 +2418,38 @@ tensorflow::Status ConvertUnidirectionalSequenceLstm( const int kInputsSize = 20; op->inputs.resize(kInputsSize); - std::vector done(kInputsSize); - int idx = 0; - for (const string& input : node.input()) { - int real_index = indices.i(idx); - op->inputs[real_index] = (input); - done[real_index] = true; - idx++; - } - for (int idx = 0; idx < done.size(); idx++) { - if (!done[idx]) { - string optional_name = node.name() + "_" + std::to_string(idx); - model->CreateOptionalArray(optional_name); - op->inputs[idx] = optional_name; + if (indices.i_size() != node.input().size()) { + // New version, the optional inputs are filled with constant nodes. + int count = 0; + for (int idx = 0; idx < kInputsSize; ++idx) { + if (count < indices.i_size() && indices.i(count) == idx) { + // Specified input. + op->inputs[idx] = node.input(idx); + count++; + } else { + // Optional input. + string optional_name = node.name() + "_" + std::to_string(idx); + model->CreateOptionalArray(optional_name); + op->inputs[idx] = optional_name; + } + } + } else { // Legacy version. 
+ std::vector done(kInputsSize); + int idx = 0; + for (const string& input : node.input()) { + int real_index = indices.i(idx); + op->inputs[real_index] = (input); + done[real_index] = true; + idx++; + } + + for (int idx = 0; idx < done.size(); idx++) { + if (!done[idx]) { + string optional_name = node.name() + "_" + std::to_string(idx); + model->CreateOptionalArray(optional_name); + op->inputs[idx] = optional_name; + } } } diff --git a/tensorflow/lite/toco/tflite/op_version.cc b/tensorflow/lite/toco/tflite/op_version.cc index a696306c8e5..09150d23f37 100644 --- a/tensorflow/lite/toco/tflite/op_version.cc +++ b/tensorflow/lite/toco/tflite/op_version.cc @@ -89,7 +89,7 @@ string GetMinimumRuntimeVersionForModel(const Model& model) { {{OperatorType::kGatherNd, 1}, "1.14.0"}, {{OperatorType::kSvdf, 1}, "1.5.0"}, {{OperatorType::kSvdf, 2}, "1.14.0"}, - {{OperatorType::kSvdf, 3}, kPendingReleaseOpVersion}, + {{OperatorType::kSvdf, 3}, "2.2.0"}, {{OperatorType::kL2Normalization, 1}, "1.5.0"}, {{OperatorType::kL2Normalization, 2}, "1.14.0"}, {{OperatorType::kL2Pool, 1}, "1.5.0"}, @@ -106,6 +106,7 @@ string GetMinimumRuntimeVersionForModel(const Model& model) { {{OperatorType::kPad, 1}, "1.5.0"}, {{OperatorType::kPad, 2}, "1.14.0"}, {{OperatorType::kTile, 1}, "1.10.1"}, + {{OperatorType::kTile, 2}, kPendingReleaseOpVersion}, {{OperatorType::kPadV2, 1}, "1.9.0"}, {{OperatorType::kPadV2, 2}, "1.14.0"}, {{OperatorType::kReshape, 1}, "1.5.0"}, @@ -137,7 +138,7 @@ string GetMinimumRuntimeVersionForModel(const Model& model) { {{OperatorType::kRelu6, 2}, "1.14.0"}, {{OperatorType::kResizeBilinear, 1}, "1.7.0"}, {{OperatorType::kResizeBilinear, 2}, "1.14.0"}, - {{OperatorType::kResizeBilinear, 3}, kPendingReleaseOpVersion}, + {{OperatorType::kResizeBilinear, 3}, "2.2.0"}, {{OperatorType::kResizeNearestNeighbor, 1}, "1.13.1"}, {{OperatorType::kResizeNearestNeighbor, 2}, "1.14.0"}, {{OperatorType::kSqueeze, 1}, "1.6.0"}, @@ -171,7 +172,7 @@ string GetMinimumRuntimeVersionForModel(const Model& model) { {{OperatorType::kCTCBeamSearchDecoder, 1}, "1.11.0"}, {{OperatorType::kUnpack, 1}, "1.11.0"}, {{OperatorType::kUnpack, 2}, "1.14.0"}, - {{OperatorType::kUnpack, 3}, kPendingReleaseOpVersion}, + {{OperatorType::kUnpack, 3}, "2.2.0"}, {{OperatorType::kLeakyRelu, 1}, "1.13.1"}, {{OperatorType::kLogistic, 1}, "1.14.0"}, {{OperatorType::kLogistic, 2}, "1.14.0"}, @@ -198,10 +199,10 @@ string GetMinimumRuntimeVersionForModel(const Model& model) { {{OperatorType::kLess, 2}, "1.14.0"}, {{OperatorType::kLessEqual, 1}, "1.14.0"}, {{OperatorType::kLessEqual, 2}, "1.14.0"}, - {{OperatorType::kSegmentSum, 1}, kPendingReleaseOpVersion}, + {{OperatorType::kSegmentSum, 1}, "2.2.0"}, {{OperatorType::kSelect, 1}, "1.14.0"}, {{OperatorType::kSelect, 2}, "1.14.0"}, - {{OperatorType::kSelectV2, 1}, kPendingReleaseOpVersion}, + {{OperatorType::kSelectV2, 1}, "2.2.0"}, {{OperatorType::kFloorDiv, 1}, "1.14.0"}, {{OperatorType::kFloorDiv, 2}, "1.14.0"}, {{OperatorType::kFloor, 1}, "1.9.0"}, @@ -232,7 +233,7 @@ string GetMinimumRuntimeVersionForModel(const Model& model) { {{OperatorType::kHardSwish, 1}, "1.15.0"}, {{OperatorType::kFill, 1}, "1.13.0"}, {{OperatorType::kReverseV2, 1}, "1.14.0"}, - {{OperatorType::kReverseV2, 2}, kPendingReleaseOpVersion}, + {{OperatorType::kReverseV2, 2}, "2.2.0"}, {{OperatorType::kRank, 1}, "1.14.0"}, }); diff --git a/tensorflow/lite/tools/benchmark/BUILD b/tensorflow/lite/tools/benchmark/BUILD index 72968fc8e24..5a413112e2f 100644 --- a/tensorflow/lite/tools/benchmark/BUILD +++ 
b/tensorflow/lite/tools/benchmark/BUILD @@ -118,6 +118,7 @@ cc_library( deps = [ ":benchmark_model_lib", "//tensorflow/lite/profiling:profile_summarizer", + "//tensorflow/lite/profiling:profile_summary_formatter", "//tensorflow/lite/profiling:profiler", ], ) diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc index 23b76a921c5..6b1e9819312 100644 --- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc +++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc @@ -185,6 +185,13 @@ std::vector TfLiteIntArrayToVector(const TfLiteIntArray* int_array) { return values; } +std::shared_ptr +CreateProfileSummaryFormatter(bool format_as_csv) { + return format_as_csv + ? std::make_shared() + : std::make_shared(); +} + } // namespace BenchmarkParams BenchmarkTfLiteModel::DefaultParams() { @@ -566,7 +573,9 @@ BenchmarkTfLiteModel::MayCreateProfilingListener() const { if (!params_.Get("enable_op_profiling")) return nullptr; return std::unique_ptr(new ProfilingListener( interpreter_.get(), params_.Get("max_profiling_buffer_entries"), - params_.Get("profiling_output_csv_file"))); + params_.Get("profiling_output_csv_file"), + CreateProfileSummaryFormatter( + !params_.Get("profiling_output_csv_file").empty()))); } TfLiteStatus BenchmarkTfLiteModel::RunImpl() { return interpreter_->Invoke(); } diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h index 1d056bdf0cf..a0bcce843ab 100644 --- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h +++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h @@ -24,7 +24,6 @@ limitations under the License. #include #include "tensorflow/lite/model.h" -#include "tensorflow/lite/profiling/profile_summary_formatter.h" #include "tensorflow/lite/profiling/profiler.h" #include "tensorflow/lite/tools/benchmark/benchmark_model.h" diff --git a/tensorflow/lite/tools/benchmark/profiling_listener.cc b/tensorflow/lite/tools/benchmark/profiling_listener.cc index 8d7a0fe3537..50df69c4b7c 100644 --- a/tensorflow/lite/tools/benchmark/profiling_listener.cc +++ b/tensorflow/lite/tools/benchmark/profiling_listener.cc @@ -20,14 +20,15 @@ limitations under the License. namespace tflite { namespace benchmark { -ProfilingListener::ProfilingListener(Interpreter* interpreter, - uint32_t max_num_entries, - const std::string& csv_file_path) - : interpreter_(interpreter), - profiler_(max_num_entries), - run_summarizer_(CreateProfileSummaryFormatter(!csv_file_path.empty())), - init_summarizer_(CreateProfileSummaryFormatter(!csv_file_path.empty())), - csv_file_path_(csv_file_path) { +ProfilingListener::ProfilingListener( + Interpreter* interpreter, uint32_t max_num_entries, + const std::string& csv_file_path, + std::shared_ptr summarizer_formatter) + : run_summarizer_(summarizer_formatter), + init_summarizer_(summarizer_formatter), + csv_file_path_(csv_file_path), + interpreter_(interpreter), + profiler_(max_num_entries) { TFLITE_BENCHMARK_CHECK(interpreter); interpreter_->SetProfiler(&profiler_); @@ -85,12 +86,5 @@ void ProfilingListener::WriteOutput(const std::string& header, (*stream) << data << std::endl; } -std::unique_ptr -ProfilingListener::CreateProfileSummaryFormatter(bool format_as_csv) const { - return format_as_csv - ? 
std::make_unique() - : std::make_unique(); -} - } // namespace benchmark } // namespace tflite diff --git a/tensorflow/lite/tools/benchmark/profiling_listener.h b/tensorflow/lite/tools/benchmark/profiling_listener.h index 9c0f6745bbb..0b2772baea1 100644 --- a/tensorflow/lite/tools/benchmark/profiling_listener.h +++ b/tensorflow/lite/tools/benchmark/profiling_listener.h @@ -16,8 +16,11 @@ limitations under the License. #ifndef TENSORFLOW_LITE_TOOLS_BENCHMARK_PROFILING_LISTENER_H_ #define TENSORFLOW_LITE_TOOLS_BENCHMARK_PROFILING_LISTENER_H_ +#include + #include "tensorflow/lite/profiling/buffered_profiler.h" #include "tensorflow/lite/profiling/profile_summarizer.h" +#include "tensorflow/lite/profiling/profile_summary_formatter.h" #include "tensorflow/lite/tools/benchmark/benchmark_model.h" namespace tflite { @@ -26,8 +29,11 @@ namespace benchmark { // Dumps profiling events if profiling is enabled. class ProfilingListener : public BenchmarkListener { public: - explicit ProfilingListener(Interpreter* interpreter, uint32_t max_num_entries, - const std::string& csv_file_path = ""); + ProfilingListener( + Interpreter* interpreter, uint32_t max_num_entries, + const std::string& csv_file_path = "", + std::shared_ptr summarizer_formatter = + std::make_shared()); void OnBenchmarkStart(const BenchmarkParams& params) override; @@ -38,18 +44,15 @@ class ProfilingListener : public BenchmarkListener { void OnBenchmarkEnd(const BenchmarkResults& results) override; protected: - // Allow subclasses to create a customized summary writer during init. - virtual std::unique_ptr - CreateProfileSummaryFormatter(bool format_as_csv) const; + profiling::ProfileSummarizer run_summarizer_; + profiling::ProfileSummarizer init_summarizer_; + std::string csv_file_path_; private: void WriteOutput(const std::string& header, const string& data, std::ostream* stream); Interpreter* interpreter_; profiling::BufferedProfiler profiler_; - profiling::ProfileSummarizer run_summarizer_; - profiling::ProfileSummarizer init_summarizer_; - std::string csv_file_path_; }; } // namespace benchmark diff --git a/tensorflow/lite/tools/make/Makefile b/tensorflow/lite/tools/make/Makefile index c1a20eccb0a..b78fb14b785 100644 --- a/tensorflow/lite/tools/make/Makefile +++ b/tensorflow/lite/tools/make/Makefile @@ -56,9 +56,8 @@ LIBS := \ # There are no rules for compiling objects for the host system (since we don't # generate things like the protobuf compiler that require that), so all of # these settings are for the target compiler. -CXXFLAGS := -O3 -DNDEBUG -fPIC -CXXFLAGS += $(EXTRA_CXXFLAGS) -CFLAGS := ${CXXFLAGS} +CFLAGS := -O3 -DNDEBUG -fPIC +CXXFLAGS := $(CFLAGS) --std=c++11 $(EXTRA_CXXFLAGS) LDOPTS := -L/usr/local/lib ARFLAGS := -r TARGET_TOOLCHAIN_PREFIX := @@ -68,10 +67,6 @@ ifeq ($(HOST_OS),windows) CXXFLAGS += -fext-numeric-literals -D__LITTLE_ENDIAN__ endif -ifeq ($(TARGET),ios) -CXXFLAGS += --std=c++11 -endif - # Auto-detect optimization opportunity if building natively. 
ifeq ($(HOST_OS),$(TARGET)) ifeq ($(HOST_ARCH),$(TARGET_ARCH)) @@ -121,7 +116,7 @@ $(wildcard tensorflow/lite/*.c) \ $(wildcard tensorflow/lite/c/*.c) \ $(wildcard tensorflow/lite/core/*.cc) \ $(wildcard tensorflow/lite/core/api/*.cc) \ -$(wildcard tensorflow/lite/experimental/resource_variable/*.cc) \ +$(wildcard tensorflow/lite/experimental/resource/*.cc) \ $(wildcard tensorflow/lite/experimental/ruy/*.cc) ifneq ($(BUILD_TYPE),micro) CORE_CC_ALL_SRCS += \ @@ -148,6 +143,7 @@ $(wildcard tensorflow/lite/*/*/benchmark.cc) \ $(wildcard tensorflow/lite/*/*/example*.cc) \ $(wildcard tensorflow/lite/*/*/test*.cc) \ $(wildcard tensorflow/lite/*/*/*test.cc) \ +$(wildcard tensorflow/lite/*/*/*tool.cc) \ $(wildcard tensorflow/lite/*/*/*/*test.cc) \ $(wildcard tensorflow/lite/kernels/*test_main.cc) \ $(wildcard tensorflow/lite/kernels/*test_util*.cc) \ diff --git a/tensorflow/lite/tools/optimize/sparsity/format_converter.cc b/tensorflow/lite/tools/optimize/sparsity/format_converter.cc index c640b75176e..2f4284dca82 100644 --- a/tensorflow/lite/tools/optimize/sparsity/format_converter.cc +++ b/tensorflow/lite/tools/optimize/sparsity/format_converter.cc @@ -275,7 +275,7 @@ void FormatConverter::Populate(const T* src_data, std::vector indices, for (; i < indices.size(); i++) { int orig_dim = block_map_[traversal_order_[i] - orig_rank]; orig_idx[orig_dim] = - orig_idx[orig_dim] * blocked_shape_[orig_dim] + indices[i]; + orig_idx[orig_dim] * block_size_[orig_dim] + indices[i]; } data_[GetFlattenedIndex(orig_idx, dense_shape_)] = src_data[*src_data_ptr]; diff --git a/tensorflow/lite/tools/optimize/sparsity/format_converter_test.cc b/tensorflow/lite/tools/optimize/sparsity/format_converter_test.cc index 8f617cd5c19..4531e7c3341 100644 --- a/tensorflow/lite/tools/optimize/sparsity/format_converter_test.cc +++ b/tensorflow/lite/tools/optimize/sparsity/format_converter_test.cc @@ -425,6 +425,39 @@ TEST(FormatConverterTest, BlockTestD0S1LastBlockEmpty) { EXPECT_EQ(data_back, dense_values); } +TEST(FormatConverterTest, BlockTestD0S1ColMajorBlock) { + const std::vector dense_values = {1, 0, 2, 3, 0, 4, 0, 0, 1, 0, 2, + 3, 0, 4, 0, 0, 0, 0, 5, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + const std::vector dense_shape = {4, 8}; + const std::vector traversal_order = {0, 1, 3, 2}; + const std::vector format = {kTfLiteDimDense, + kTfLiteDimSparseCSR}; + const std::vector block_size = {2, 2}; + const std::vector block_map = {0, 1}; + FormatConverter converter(dense_shape, traversal_order, format, + block_size, block_map); + converter.DenseToSparse(dense_values.data()); + + const auto& dim_metadata = converter.GetDimMetadata(); + const std::vector dm = {2}; + const std::vector dm1_0 = {0, 3, 4}; + const std::vector dm1_1 = {0, 1, 2, 1}; + EXPECT_EQ(dm, dim_metadata[0]); + EXPECT_EQ(dm1_0, dim_metadata[2]); + EXPECT_EQ(dm1_1, dim_metadata[3]); + EXPECT_EQ(dm, dim_metadata[4]); + EXPECT_EQ(dm, dim_metadata[6]); + + const auto& data = converter.GetData(); + const std::vector expected_data = {1, 1, 0, 0, 2, 2, 3, 3, + 0, 0, 4, 4, 5, 0, 0, 0}; + EXPECT_EQ(expected_data, data); + + converter.SparseToDense(expected_data.data()); + const auto& data_back = converter.GetData(); + EXPECT_EQ(data_back, dense_values); +} } // namespace } // namespace sparsity } // namespace optimize diff --git a/tensorflow/lite/tools/verifier.cc b/tensorflow/lite/tools/verifier.cc index d9b737ba77e..f5b369ea501 100644 --- a/tensorflow/lite/tools/verifier.cc +++ b/tensorflow/lite/tools/verifier.cc @@ -113,6 +113,65 @@ bool 
VerifyStringTensorBuffer(const Tensor& tensor, const Buffer& buffer,
   return true;
 }

+int GetSizeOfSegments(const DimensionMetadata* dim_metadata) {
+  switch (dim_metadata->array_segments_type()) {
+    case SparseIndexVector_Int32Vector:
+      return dim_metadata->array_segments_as_Int32Vector()->values()->size();
+    case SparseIndexVector_Uint16Vector:
+      return dim_metadata->array_segments_as_Uint16Vector()->values()->size();
+    case SparseIndexVector_Uint8Vector:
+      return dim_metadata->array_segments_as_Uint8Vector()->values()->size();
+    default:
+      return -1;
+  }
+}
+
+int GetValueOfSegmentsAt(const DimensionMetadata* dim_metadata, const int i) {
+  switch (dim_metadata->array_segments_type()) {
+    case SparseIndexVector_Int32Vector:
+      return static_cast<int>(
+          dim_metadata->array_segments_as_Int32Vector()->values()->Get(i));
+    case SparseIndexVector_Uint16Vector:
+      return static_cast<int>(
+          dim_metadata->array_segments_as_Uint16Vector()->values()->Get(i));
+    case SparseIndexVector_Uint8Vector:
+      return static_cast<int>(
+          dim_metadata->array_segments_as_Uint8Vector()->values()->Get(i));
+    default:
+      return -1;
+  }
+}
+
+int GetSizeOfIndices(const DimensionMetadata* dim_metadata) {
+  switch (dim_metadata->array_indices_type()) {
+    case SparseIndexVector_Int32Vector:
+      return dim_metadata->array_indices_as_Int32Vector()->values()->size();
+    case SparseIndexVector_Uint16Vector:
+      return dim_metadata->array_indices_as_Uint16Vector()->values()->size();
+    case SparseIndexVector_Uint8Vector:
+      return dim_metadata->array_indices_as_Uint8Vector()->values()->size();
+    default:
+      return -1;
+  }
+}
+
+int GetValueOfIndicesAt(const DimensionMetadata* dim_metadata, const int i) {
+  switch (dim_metadata->array_indices_type()) {
+    case SparseIndexVector_Int32Vector:
+      return static_cast<int>(
+          dim_metadata->array_indices_as_Int32Vector()->values()->Get(i));
+    case SparseIndexVector_Uint16Vector:
+      return static_cast<int>(
+          dim_metadata->array_indices_as_Uint16Vector()->values()->Get(i));
+    case SparseIndexVector_Uint8Vector:
+      return static_cast<int>(
+          dim_metadata->array_indices_as_Uint8Vector()->values()->Get(i));
+    default:
+      return -1;
+  }
+  return -1;
+}
+
 // The sparsity parameter defines a tree structure to map each non-zero element
 // stored in the flattened buffer back to its index in the conceptual dense
 // tensor.
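These four helpers exist so the verification loop in the next hunk can stay type-agnostic: every read of a segment or index goes through a switch on the union's type tag. Condensed to plain vectors, the CSR invariants that loop enforces look like the sketch below; CsrDimLooksValid is a hypothetical stand-in, not code from the patch.

#include <cstddef>
#include <vector>

// Hypothetical stand-in for the checks in VerifyAndCountElements below,
// written against plain vectors instead of the flatbuffer accessors.
bool CsrDimLooksValid(const std::vector<int>& segments,
                      const std::vector<int>& indices, int num_elements,
                      int dim_size) {
  // One boundary per element of the outer dimension, plus the leading 0.
  if (static_cast<int>(segments.size()) != num_elements + 1) return false;
  // Boundaries must be non-negative and non-decreasing.
  for (std::size_t j = 0; j + 1 < segments.size(); ++j) {
    if (segments[j] < 0 || segments[j + 1] < 0 ||
        segments[j] > segments[j + 1]) {
      return false;
    }
  }
  // The final boundary must equal the number of stored indices.
  if (static_cast<int>(indices.size()) != segments.back()) return false;
  // Every index must lie inside the original (dense) dimension.
  for (int idx : indices) {
    if (idx < 0 || idx >= dim_size) return false;
  }
  return true;
}

Returning -1 from the accessors above makes a malformed or unknown type tag fall out of these size and bounds comparisons rather than crash the verifier.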
@@ -139,31 +198,36 @@ absl::optional VerifyAndCountElements( return absl::nullopt; } - for (int j = 0; j < array_segments->size() - 1; j++) { - if (array_segments->Get(j) < 0 || array_segments->Get(j + 1) < 0 || - array_segments->Get(j) > array_segments->Get(j + 1)) { + int array_segments_size = GetSizeOfSegments(dim_metadata); + int array_indices_size = GetSizeOfIndices(dim_metadata); + + for (int j = 0; j < array_segments_size - 1; j++) { + if (GetValueOfSegmentsAt(dim_metadata, j) < 0 || + GetValueOfSegmentsAt(dim_metadata, j + 1) < 0 || + GetValueOfSegmentsAt(dim_metadata, j) > + GetValueOfSegmentsAt(dim_metadata, j + 1)) { return absl::nullopt; } } - if (num_elements != array_segments->size() - 1) { + if (num_elements != array_segments_size - 1) { return absl::nullopt; } - if (array_indices->size() != - array_segments->Get(array_segments->size() - 1)) { + if (array_indices_size != + GetValueOfSegmentsAt(dim_metadata, array_segments_size - 1)) { return absl::nullopt; } - for (int j = 0; j < array_indices->size(); j++) { - if (array_indices->Get(j) < 0 || - array_indices->Get(j) >= dim_sizes[original_dim]) { + for (int j = 0; j < array_indices_size; j++) { + if (GetValueOfIndicesAt(dim_metadata, j) < 0 || + GetValueOfIndicesAt(dim_metadata, j) >= dim_sizes[original_dim]) { return absl::nullopt; } } // Need to reset num_elements when seeing a sparse dimension. - num_elements = array_indices->size(); + num_elements = array_indices_size; } } diff --git a/tensorflow/lite/tools/verifier_test.cc b/tensorflow/lite/tools/verifier_test.cc index 355ee6640c6..1e13fda7c33 100644 --- a/tensorflow/lite/tools/verifier_test.cc +++ b/tensorflow/lite/tools/verifier_test.cc @@ -613,7 +613,8 @@ TEST(VerifyModel, InvalidSparseTensorIndexOutOfBound) { scoped_model.reset(model->GetModel()->UnPack()); auto* tensor = scoped_model->subgraphs[0]->tensors[0].get(); - tensor->sparsity->dim_metadata[1]->array_indices[1] = 5; + tensor->sparsity->dim_metadata[1]->array_indices.AsUint8Vector()->values[1] = + 5; flatbuffers::FlatBufferBuilder builder; auto model_ = Model::Pack(builder, scoped_model.get()); @@ -693,8 +694,10 @@ TEST(VerifyModel, ValidSparseTensorBCSC) { tensor->sparsity->dim_metadata[0]->dense_size = 2; tensor->sparsity->dim_metadata[1]->format = DimensionType_SPARSE_CSR; - tensor->sparsity->dim_metadata[1]->array_segments = {0, 1, 3}; - tensor->sparsity->dim_metadata[1]->array_indices = {0, 0, 1}; + tensor->sparsity->dim_metadata[1]->array_segments.AsUint8Vector()->values = { + 0, 1, 3}; + tensor->sparsity->dim_metadata[1]->array_indices.AsUint8Vector()->values = { + 0, 0, 1}; tensor->sparsity->dim_metadata[2]->format = DimensionType_DENSE; tensor->sparsity->dim_metadata[2]->dense_size = 2; diff --git a/tensorflow/lite/tools/versioning/op_version.cc b/tensorflow/lite/tools/versioning/op_version.cc index 77c39ff7073..b699f0dbc9b 100644 --- a/tensorflow/lite/tools/versioning/op_version.cc +++ b/tensorflow/lite/tools/versioning/op_version.cc @@ -287,6 +287,12 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) { } return 1; + case BuiltinOperator_TILE: + if (op_sig.input_types.at(0) == TensorType_STRING) { + return 2; + } + return 1; + case BuiltinOperator_AVERAGE_POOL_2D: case BuiltinOperator_ADD: case BuiltinOperator_SPACE_TO_BATCH_ND: diff --git a/tensorflow/lite/tools/versioning/op_version_test.cc b/tensorflow/lite/tools/versioning/op_version_test.cc index b417fc5c47d..8cd873aa697 100644 --- a/tensorflow/lite/tools/versioning/op_version_test.cc +++ 
b/tensorflow/lite/tools/versioning/op_version_test.cc
@@ -432,4 +432,17 @@ TEST(OpVersionTest, VersioningDepthwiseConv2DTest) {
   fake_op_sig.options.depthwise_conv_2d.dilation_h_factor = 1;
   EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 1);
 }
+TEST(OpVersionTest, VersioningTileOperatorTest) {
+  OpSignature fake_op_sig = {
+      .op = BuiltinOperator_TILE,
+      .input_types = std::vector<TensorType>{TensorType_INT32},
+  };
+  EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 1);
+
+  fake_op_sig = {
+      .op = BuiltinOperator_TILE,
+      .input_types = std::vector<TensorType>{TensorType_STRING},
+  };
+  EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 2);
+}
 }  // namespace tflite
diff --git a/tensorflow/lite/tools/visualize.py b/tensorflow/lite/tools/visualize.py
index b78695be5a5..1f89f9c5448 100644
--- a/tensorflow/lite/tools/visualize.py
+++ b/tensorflow/lite/tools/visualize.py
@@ -265,7 +265,9 @@ class TensorMapper(object):
       html += str(i) + " "
       html += NameListToString(tensor["name"]) + " "
       html += TensorTypeToName(tensor["type"]) + " "
-      html += (repr(tensor["shape"]) if "shape" in tensor else "[]") + "<br>"
+      html += (repr(tensor["shape"]) if "shape" in tensor else "[]")
+      html += (repr(tensor["shape_signature"])
+               if "shape_signature" in tensor else "[]") + "<br>"
       html += "</span>"
       html += repr(x)
       html += "</span>"
@@ -447,9 +449,9 @@
                           ("builtin_options", None), ("opcode_index", opcode_mapper)]
     tensor_keys_to_display = [("name", NameListToString),
-                              ("type", TensorTypeToName),
-                              ("shape", None),
-                              ("buffer", None), ("quantization", None)]
+                              ("type", TensorTypeToName), ("shape", None),
+                              ("shape_signature", None), ("buffer", None),
+                              ("quantization", None)]
     html += "<h2>Subgraph %d</h2>
\n" % subgraph_idx diff --git a/tensorflow/opensource_only.files b/tensorflow/opensource_only.files index bba10464933..8ee24498ef0 100644 --- a/tensorflow/opensource_only.files +++ b/tensorflow/opensource_only.files @@ -215,33 +215,11 @@ tensorflow/third_party/toolchains/cpus/arm/cc_config.bzl.tpl tensorflow/third_party/toolchains/cpus/py/BUILD tensorflow/third_party/toolchains/cpus/py3/BUILD tensorflow/third_party/toolchains/java/BUILD -tensorflow/third_party/toolchains/preconfig/centos6/cuda10.0-cudnn7/cuda/BUILD -tensorflow/third_party/toolchains/preconfig/centos6/cuda10.0-cudnn7/cuda/build_defs.bzl -tensorflow/third_party/toolchains/preconfig/centos6/cuda10.1-cudnn7/cuda/BUILD -tensorflow/third_party/toolchains/preconfig/centos6/cuda10.1-cudnn7/cuda/build_defs.bzl -tensorflow/third_party/toolchains/preconfig/centos6/gcc7-nvcc-cuda10.0/BUILD -tensorflow/third_party/toolchains/preconfig/centos6/gcc7-nvcc-cuda10.0/cc_toolchain_config.bzl -tensorflow/third_party/toolchains/preconfig/centos6/gcc7-nvcc-cuda10.1/BUILD -tensorflow/third_party/toolchains/preconfig/centos6/gcc7-nvcc-cuda10.1/cc_toolchain_config.bzl -tensorflow/third_party/toolchains/preconfig/centos6/gcc7/BUILD -tensorflow/third_party/toolchains/preconfig/centos6/gcc7/cc_toolchain_config.bzl -tensorflow/third_party/toolchains/preconfig/centos6/gcc7/dummy_toolchain.bzl -tensorflow/third_party/toolchains/preconfig/centos6/py/BUILD -tensorflow/third_party/toolchains/preconfig/centos6/py3/BUILD -tensorflow/third_party/toolchains/preconfig/centos6/tensorrt5/BUILD -tensorflow/third_party/toolchains/preconfig/centos6/tensorrt5/build_defs.bzl tensorflow/third_party/toolchains/preconfig/generate/BUILD tensorflow/third_party/toolchains/preconfig/generate/archives.bzl tensorflow/third_party/toolchains/preconfig/generate/containers.bzl tensorflow/third_party/toolchains/preconfig/generate/generate.bzl tensorflow/third_party/toolchains/preconfig/generate/workspace.bzl -tensorflow/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD -tensorflow/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/build_defs.bzl -tensorflow/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/BUILD -tensorflow/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/cc_toolchain_config.bzl -tensorflow/third_party/toolchains/preconfig/ubuntu14.04/py3/BUILD -tensorflow/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/BUILD -tensorflow/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/build_defs.bzl tensorflow/third_party/toolchains/preconfig/ubuntu16.04/clang/BUILD tensorflow/third_party/toolchains/preconfig/ubuntu16.04/clang/cc_toolchain_config.bzl tensorflow/third_party/toolchains/preconfig/ubuntu16.04/clang/dummy_toolchain.bzl @@ -259,7 +237,6 @@ tensorflow/third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010/BUILD tensorflow/third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010/cc_toolchain_config.bzl tensorflow/third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010/dummy_toolchain.bzl tensorflow/third_party/toolchains/preconfig/ubuntu16.04/py/BUILD -tensorflow/third_party/toolchains/preconfig/ubuntu16.04/py3/BUILD tensorflow/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5/BUILD tensorflow/third_party/toolchains/preconfig/ubuntu16.04/tensorrt6.0/BUILD tensorflow/third_party/toolchains/preconfig/ubuntu16.04/tensorrt6.0/build_defs.bzl @@ -357,6 +334,9 @@ tensorflow/tools/ci_build/release/ubuntu_16/gpu_py37_full/nonpip.sh 
tensorflow/tools/ci_build/release/ubuntu_16/gpu_py37_full/nonpip_v1.sh tensorflow/tools/ci_build/release/ubuntu_16/gpu_py37_full/pip.sh tensorflow/tools/ci_build/release/ubuntu_16/gpu_py37_full/pip_v1.sh +tensorflow/tools/ci_build/release/ubuntu_16/gpu_py38_full/nightly_release.sh +tensorflow/tools/ci_build/release/ubuntu_16/gpu_py38_full/nonpip.sh +tensorflow/tools/ci_build/release/ubuntu_16/gpu_py38_full/pip.sh tensorflow/tools/ci_build/release/ubuntu_16/sanity/build.sh tensorflow/tools/ci_build/release/ubuntu_16/tpu_py37_full/nonpip.sh tensorflow/tools/ci_build/release/windows/cpu_libtensorflow/nightly.bat @@ -376,6 +356,10 @@ tensorflow/tools/ci_build/release/windows/cpu_py37_full/nightly_release.bat tensorflow/tools/ci_build/release/windows/cpu_py37_full/release.bat tensorflow/tools/ci_build/release/windows/cpu_py37_full/release_pip_rename.sh tensorflow/tools/ci_build/release/windows/cpu_py37_full/release_v1.bat +tensorflow/tools/ci_build/release/windows/cpu_py38_full/nightly.bat +tensorflow/tools/ci_build/release/windows/cpu_py38_full/nightly_release.bat +tensorflow/tools/ci_build/release/windows/cpu_py38_full/release.bat +tensorflow/tools/ci_build/release/windows/cpu_py38_full/release_pip_rename.sh tensorflow/tools/ci_build/release/windows/gpu_libtensorflow/nightly.bat tensorflow/tools/ci_build/release/windows/gpu_libtensorflow/release.bat tensorflow/tools/ci_build/release/windows/gpu_pip_on_cpu/build.bat @@ -394,6 +378,10 @@ tensorflow/tools/ci_build/release/windows/gpu_py37_full/nightly_release.bat tensorflow/tools/ci_build/release/windows/gpu_py37_full/release.bat tensorflow/tools/ci_build/release/windows/gpu_py37_full/release_pip_rename.sh tensorflow/tools/ci_build/release/windows/gpu_py37_full/release_v1.bat +tensorflow/tools/ci_build/release/windows/gpu_py38_full/nightly.bat +tensorflow/tools/ci_build/release/windows/gpu_py38_full/nightly_release.bat +tensorflow/tools/ci_build/release/windows/gpu_py38_full/release.bat +tensorflow/tools/ci_build/release/windows/gpu_py38_full/release_pip_rename.sh tensorflow/tools/ci_build/release/windows/upload_nightly_pip/upload.sh tensorflow/tools/ci_build/remote/BUILD tensorflow/tools/def_file_filter/BUILD diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 15d21d34bc5..9c8f4227d12 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -623,10 +623,9 @@ tf_python_pybind_extension( "@pybind11", "//third_party/python_runtime:headers", "//tensorflow/core:protos_all_cc", - "//tensorflow/core:framework", + "//tensorflow/core:framework_headers_lib", "//tensorflow/core:core_cpu_headers_lib", - "//tensorflow/core:lib", - "//tensorflow/core:lib_internal", + "//tensorflow/core:lib_headers_for_pybind", "@com_google_absl//absl/types:optional", ] + if_static( extra_deps = [ @@ -2147,9 +2146,7 @@ tf_py_test( tf_gen_op_wrapper_private_py( name = "functional_ops_gen", - visibility = [ - "//learning/brain/python/ops:__pkg__", - ], + visibility = ["//learning/brain/python/ops:__pkg__"], ) py_library( @@ -2560,6 +2557,9 @@ tf_py_test( "no_oss", "no_pip", "no_windows", + "noasan", # TODO(b/149948895): Re-enable. + "nomsan", # TODO(b/149948895): Re-enable. + "notsan", # TODO(b/149948895): Re-enable. 
], deps = [ ":framework_test_lib", @@ -2860,9 +2860,7 @@ tf_gen_op_wrapper_private_py( tf_gen_op_wrapper_private_py( name = "parsing_ops_gen", - visibility = [ - "//learning/brain/python/ops:__pkg__", - ], + visibility = ["//learning/brain/python/ops:__pkg__"], ) tf_gen_op_wrapper_private_py( @@ -2884,6 +2882,7 @@ tf_gen_op_wrapper_private_py( name = "resource_variable_ops_gen", visibility = [ "//tensorflow/compiler/tf2xla:internal", + "//tensorflow/python/distribute:__pkg__", ], ) @@ -5824,6 +5823,7 @@ filegroup( "//tensorflow/c:checkpoint_reader", # checkpoint_reader "//tensorflow/c:python_api", # tf_session "//tensorflow/c:tf_status_helper", # tfe + "//tensorflow/compiler/jit:flags", #tfe "//tensorflow/compiler/mlir/python:mlir", # mlir "//tensorflow/core:core_cpu_base_no_ops", # tf_session "//tensorflow/core:core_cpu_impl", # device_lib @@ -7496,7 +7496,7 @@ tf_python_pybind_extension( ":pybind11_status", "@pybind11", "//tensorflow/core:core_cpu_headers_lib", - "//tensorflow/core:framework", + "//tensorflow/core:framework_headers_lib", "//tensorflow/core:gpu_id", "//tensorflow/core:protos_all_cc", ] + if_not_windows(["//tensorflow/core/grappler/costs:graph_properties"]), # b/148556093, @@ -7571,9 +7571,9 @@ tf_python_pybind_extension( deps = [ ":pybind11_status", "//tensorflow/core:core_cpu_headers_lib", - "//tensorflow/core:framework", + "//tensorflow/core:framework_headers_lib", "//tensorflow/core:gpu_id", - "//tensorflow/core:lib", + "//tensorflow/core:lib_headers_for_pybind", "//tensorflow/core:protos_all_cc", "@com_google_absl//absl/types:span", "@pybind11", @@ -7632,9 +7632,9 @@ tf_python_pybind_extension( deps = [ ":pybind11_status", "//tensorflow/core:core_cpu_headers_lib", - "//tensorflow/core:framework", + "//tensorflow/core:framework_headers_lib", "//tensorflow/core:gpu_id", - "//tensorflow/core:lib", + "//tensorflow/core:lib_headers_for_pybind", "//tensorflow/core:protos_all_cc", "@pybind11", ], @@ -8048,6 +8048,7 @@ tf_python_pybind_extension( "@com_google_absl//absl/types:optional", "@pybind11", "//third_party/python_runtime:headers", + "//tensorflow/compiler/jit:flags_headers_only", "//tensorflow/core:core_cpu_headers_lib", "//tensorflow/core:framework", "//tensorflow/core:lib", @@ -8056,13 +8057,11 @@ tf_python_pybind_extension( "//tensorflow/core/platform:platform", ] + if_static( extra_deps = [ - "//tensorflow/compiler/jit:flags", "//tensorflow/core:eager_service_proto_cc", "//tensorflow/core:master_proto_cc", "//tensorflow/core:worker_proto_cc", ], otherwise = [ - "//tensorflow/compiler/jit:flags_headers_only", "//tensorflow/core:eager_service_proto_cc_headers_only", "//tensorflow/core:master_proto_cc_headers_only", "//tensorflow/core:worker_proto_cc_headers_only", diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py index 6d88cb566ae..c54b58c6259 100644 --- a/tensorflow/python/__init__.py +++ b/tensorflow/python/__init__.py @@ -48,17 +48,6 @@ import traceback import numpy as np from tensorflow.python import pywrap_tensorflow -from tensorflow.python import _pywrap_utils -from tensorflow.python import _pywrap_tfprof -from tensorflow.python import _pywrap_events_writer -from tensorflow.python import _pywrap_util_port -from tensorflow.python import _pywrap_stat_summarizer -from tensorflow.python import _pywrap_py_exception_registry -from tensorflow.python import _pywrap_python_op_gen -from tensorflow.python import _pywrap_kernel_registry -from tensorflow.python import _pywrap_quantize_training -from tensorflow.python import _pywrap_transform_graph 
-from tensorflow.python import _pywrap_stacktrace_handler # Protocol buffers from tensorflow.core.framework.graph_pb2 import * diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index e4638ead571..4af0c9eb259 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -31,7 +31,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 2, 20) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 2, 25) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None diff --git a/tensorflow/python/compiler/tensorrt/trt_convert.py b/tensorflow/python/compiler/tensorrt/trt_convert.py index 2ea22ebba49..352a080debb 100644 --- a/tensorflow/python/compiler/tensorrt/trt_convert.py +++ b/tensorflow/python/compiler/tensorrt/trt_convert.py @@ -113,10 +113,13 @@ DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES = 1 << 30 @tf_export("experimental.tensorrt.ConversionParams", v1=[]) -class TrtConversionParams(collections.namedtuple("TrtConversionParams", [ - "rewriter_config_template", "max_workspace_size_bytes", "precision_mode", - "minimum_segment_size", "is_dynamic_op", "maximum_cached_engines", - "use_calibration", "max_batch_size"])): +class TrtConversionParams( + collections.namedtuple("TrtConversionParams", [ + "rewriter_config_template", "max_workspace_size_bytes", + "precision_mode", "minimum_segment_size", "is_dynamic_op", + "maximum_cached_engines", "use_calibration", "max_batch_size", + "allow_build_at_runtime" + ])): """Parameters that are used for TF-TRT conversion. Fields: @@ -151,6 +154,11 @@ class TrtConversionParams(collections.namedtuple("TrtConversionParams", [ tensors were trained with fake quantization. max_batch_size: max size for the input batch. This parameter is only effective when is_dynamic_op=False which is not supported in TF 2.0. + allow_build_at_runtime: whether to build TensorRT engines during runtime. + If no TensorRT engine can be found in cache that can handle the given + inputs during runtime, then a new TensorRT engine is built at runtime if + allow_build_at_runtime=True, and otherwise native TF is used. This + argument is only effective if is_dynamic_op=True. 
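# A minimal usage sketch of the new `allow_build_at_runtime` flag with
# TrtGraphConverterV2, assuming a hypothetical SavedModel path and input
# function: engines are pre-built via build(), and the saved model then
# refuses to build new engines at inference time.
import numpy as np
from tensorflow.python.compiler.tensorrt import trt_convert as trt

def sample_input_fn():
  # Hypothetical generator yielding representative inputs for build().
  yield (np.zeros((1, 28, 28, 1), dtype=np.float32),)

params = trt.DEFAULT_TRT_CONVERSION_PARAMS._replace(
    precision_mode="FP16",
    allow_build_at_runtime=False)  # engines must come from build() below
converter = trt.TrtGraphConverterV2(
    input_saved_model_dir="/tmp/saved_model",  # hypothetical path
    conversion_params=params)
converter.convert()
converter.build(input_fn=sample_input_fn)  # pre-build engines for these shapes
converter.save("/tmp/saved_model_trt")  # hypothetical output path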
""" def __new__(cls, @@ -161,11 +169,14 @@ class TrtConversionParams(collections.namedtuple("TrtConversionParams", [ is_dynamic_op=True, maximum_cached_engines=1, use_calibration=True, - max_batch_size=1): - return super(TrtConversionParams, cls).__new__( - cls, rewriter_config_template, max_workspace_size_bytes, precision_mode, - minimum_segment_size, is_dynamic_op, maximum_cached_engines, - use_calibration, max_batch_size) + max_batch_size=1, + allow_build_at_runtime=True): + return super(TrtConversionParams, + cls).__new__(cls, rewriter_config_template, + max_workspace_size_bytes, precision_mode, + minimum_segment_size, is_dynamic_op, + maximum_cached_engines, use_calibration, + max_batch_size, allow_build_at_runtime) DEFAULT_TRT_CONVERSION_PARAMS = TrtConversionParams() @@ -228,6 +239,13 @@ def _check_conversion_params(conversion_params, is_v2=False): not trt_optimizer.parameter_map["is_dynamic_op"]): raise ValueError("Option is_dynamic_op=False is not supported " "in TF 2.0, please set it to True instead.") + if (conversion_params.allow_build_at_runtime and + not conversion_params.is_dynamic_op): + tf_logging.warn( + ("Building TensorRT engines at runtime is not supported " + "if is_dynamic_op=False, therefore assuming " + "allow_build_at_runtime=False. If building TensorRT engines " + "at runtime is desired, set is_dynamic_op=True.")) def _check_trt_version_compatibility(): @@ -320,6 +338,8 @@ def get_tensorrt_rewriter_config(conversion_params, optimizer.parameter_map[ "use_calibration"].b = conversion_params.use_calibration optimizer.parameter_map["is_dynamic_op"].b = conversion_params.is_dynamic_op + optimizer.parameter_map[ + "allow_build_at_runtime"].b = conversion_params.allow_build_at_runtime if not is_v2: optimizer.parameter_map[ "max_batch_size"].i = conversion_params.max_batch_size @@ -505,7 +525,8 @@ class TrtGraphConverter(object): is_dynamic_op=is_dynamic_op, maximum_cached_engines=maximum_cached_engines, use_calibration=use_calibration, - max_batch_size=max_batch_size) + max_batch_size=max_batch_size, + allow_build_at_runtime=True) _check_conversion_params(self._conversion_params) def _run_conversion(self): @@ -1165,6 +1186,28 @@ class TrtGraphConverterV2(object): signatures = { key: value for key, value in self._saved_model.signatures.items() } + + # Set allow_build_at_runtime=False if asked by user. + # + # This attribute is set here because build() needs it to be True in order to + # build engines. 
+ if not self._conversion_params.allow_build_at_runtime: + + def _reset_allow_build_at_runtime(node): + node.attr["allow_build_at_runtime"].b = False + + self._for_each_trt_node(self._converted_graph_def, + _reset_allow_build_at_runtime) + # Rebuild the function since a node attribute changed above + reset_converted_func = wrap_function.function_from_graph_def( + self._converted_graph_def, + [tensor.name for tensor in self._converted_func.inputs], + [tensor.name for tensor in self._converted_func.outputs]) + reset_converted_func.graph.structured_outputs = nest.pack_sequence_as( + self._converted_func.graph.structured_outputs, + reset_converted_func.graph.structured_outputs) + self._converted_func = reset_converted_func + signatures[self._input_saved_model_signature_key] = self._converted_func save.save(self._saved_model, output_saved_model_dir, signatures) diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/inject_prefetch_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/inject_prefetch_test.py index 35134014e2c..3219c5a8bc9 100644 --- a/tensorflow/python/data/experimental/kernel_tests/optimization/inject_prefetch_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/optimization/inject_prefetch_test.py @@ -38,7 +38,7 @@ class InjectPrefetchTest(test_base.DatasetTestBase, parameterized.TestCase): def testParallelMap(self): dataset = dataset_ops.Dataset.range(100) parallel_map = "ParallelMap" - if compat.forward_compatible(2020, 2, 20): + if compat.forward_compatible(2020, 3, 6): parallel_map = "ParallelMapV2" dataset = dataset.apply( testing.assert_next([parallel_map, "Prefetch", "FiniteTake"])) @@ -87,7 +87,7 @@ class InjectPrefetchTest(test_base.DatasetTestBase, parameterized.TestCase): if compat.forward_compatible(2020, 3, 6): parallel_interleave = "ParallelInterleaveV4" parallel_map = "ParallelMap" - if compat.forward_compatible(2020, 2, 20): + if compat.forward_compatible(2020, 3, 6): parallel_map = "ParallelMapV2" dataset = dataset.apply( testing.assert_next([ @@ -114,5 +114,6 @@ class InjectPrefetchTest(test_base.DatasetTestBase, parameterized.TestCase): dataset = self._enable_autotune_buffers(dataset) self.assertDatasetProduces(dataset, range(1, 51)) + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py index bc56beff95c..4b3f811119b 100644 --- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py @@ -223,7 +223,7 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase): map_node_name = "Map" if num_parallel_calls is not None: map_node_name = "ParallelMap" - if compat.forward_compatible(2020, 2, 20): + if compat.forward_compatible(2020, 3, 6): map_node_name = "ParallelMapV2" def _make_dataset(node_names): @@ -523,12 +523,14 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase): def make_apply_fn(is_fused): if is_fused: + def apply_fn(dataset): return dataset.apply( batching.map_and_batch(map_fn, 2, 12, drop_remainder=True)) return apply_fn else: + def apply_fn(dataset): return dataset.map(map_fn, 12).batch(2, drop_remainder=True) diff --git a/tensorflow/python/data/experimental/kernel_tests/snapshot_test.py b/tensorflow/python/data/experimental/kernel_tests/snapshot_test.py 
index 96b3b764864..535cf884dc6 100644 --- a/tensorflow/python/data/experimental/kernel_tests/snapshot_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/snapshot_test.py @@ -161,17 +161,49 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, self.assertSnapshotDirectoryContains(tmpdir, 1, 1, 1) - @combinations.generate(test_base.default_test_combinations()) - def testWriteSnapshotRepeatAfterwards(self): + @combinations.generate( + combinations.times( + test_base.default_test_combinations(), + combinations.combine(compression=[ + snapshot.COMPRESSION_NONE, snapshot.COMPRESSION_GZIP, + snapshot.COMPRESSION_SNAPPY + ]))) + def testWriteSnapshotRepeatAfterwards(self, compression): tmpdir = self.snapshot_dir dataset = dataset_ops.Dataset.range(10) - dataset = dataset.apply(snapshot.snapshot(tmpdir)) + dataset = dataset.apply(snapshot.snapshot(tmpdir, compression=compression)) dataset = dataset.repeat(10) self.assertDatasetProduces(dataset, list(range(10)) * 10) self.assertSnapshotDirectoryContains(tmpdir, 1, 1, 1) + @combinations.generate( + combinations.times( + test_base.default_test_combinations(), + combinations.combine(compression=[ + snapshot.COMPRESSION_NONE, snapshot.COMPRESSION_GZIP, + snapshot.COMPRESSION_SNAPPY + ]))) + def testWriteSnapshotMixTypes(self, compression): + tmpdir = self.snapshot_dir + + dataset = dataset_ops.Dataset.range(10) + + def map_fn(x): + return (x, string_ops.as_string(x), string_ops.as_string(2 * x), 2 * x) + + dataset = dataset.map(map_fn) + dataset = dataset.apply(snapshot.snapshot(tmpdir, compression=compression)) + dataset = dataset.repeat(10) + + expected = [] + for i in range(10): + expected.append((i, str(i), str(2 * i), 2 * i)) + self.assertDatasetProduces(dataset, expected * 10) + + self.assertSnapshotDirectoryContains(tmpdir, 1, 1, 1) + @combinations.generate(test_base.default_test_combinations()) def testSpecifySnapshotNameWriteAndRead(self): tmpdir = self.snapshot_dir @@ -365,8 +397,14 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, res3 = self.evaluate(next3()) self.assertEqual(res2, res3) - @combinations.generate(test_base.default_test_combinations()) - def testReadSnapshotParallelAfterWrite(self): + @combinations.generate( + combinations.times( + test_base.default_test_combinations(), + combinations.combine(compression=[ + snapshot.COMPRESSION_NONE, snapshot.COMPRESSION_GZIP, + snapshot.COMPRESSION_SNAPPY + ]))) + def testReadSnapshotParallelAfterWrite(self, compression): self.setUpTFRecord(10, 4000) filenames = self.test_filenames @@ -383,7 +421,8 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, tmpdir, shard_size_bytes=1024 * 1024, num_reader_threads=2, - reader_buffer_size=10)) + reader_buffer_size=10, + compression=compression)) self.assertDatasetProduces(dataset, expected, assert_items_equal=True) # remove the original files and try to read the data back only from @@ -396,7 +435,8 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, tmpdir, shard_size_bytes=1024 * 1024, num_reader_threads=2, - reader_buffer_size=10)) + reader_buffer_size=10, + compression=compression)) self.assertDatasetProduces(dataset2, expected, assert_items_equal=True) # Not testing Snappy here because Snappy reads currently require a lot of @@ -514,21 +554,31 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, self.evaluate(next2()) self.assertSnapshotDirectoryContains(tmpdir, 2, 1, 1) - 
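# The combinations in these tests exercise every snapshot compression mode;
# the pipeline they build looks like the following sketch (directory path
# hypothetical, module paths as this test file uses them).
from tensorflow.python.data.experimental.ops import snapshot
from tensorflow.python.data.ops import dataset_ops

dataset = dataset_ops.Dataset.range(10)
dataset = dataset.apply(
    snapshot.snapshot("/tmp/snapshot_dir",
                      compression=snapshot.COMPRESSION_GZIP))
dataset = dataset.repeat(10)  # later epochs read back from the snapshot files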
@combinations.generate(test_base.default_test_combinations()) - def testSpecifyShardSize(self): + @combinations.generate( + combinations.times( + test_base.default_test_combinations(), + combinations.combine(compression=[ + snapshot.COMPRESSION_NONE, snapshot.COMPRESSION_GZIP, + snapshot.COMPRESSION_SNAPPY + ]))) + def testSpecifyShardSize(self, compression): tmpdir = self.snapshot_dir dataset = dataset_ops.Dataset.from_tensor_slices([1.0]) dataset = dataset.map(lambda x: gen_array_ops.broadcast_to(x, [1024, 1024])) dataset = dataset.repeat(10) dataset = dataset.apply( - snapshot.snapshot(tmpdir, shard_size_bytes=10 * 1024 * 1024)) + snapshot.snapshot( + tmpdir, shard_size_bytes=10 * 1024 * 1024, compression=compression)) next_fn = self.getNext(dataset) for _ in range(10): self.evaluate(next_fn()) - self.assertSnapshotDirectoryContains(tmpdir, 1, 1, 3) + num_files = 1 + if compression == snapshot.COMPRESSION_NONE: + num_files = 3 + self.assertSnapshotDirectoryContains(tmpdir, 1, 1, num_files) @combinations.generate(test_base.default_test_combinations()) def testAdditionalOperationsAfterReadBack(self): diff --git a/tensorflow/python/data/kernel_tests/padded_batch_test.py b/tensorflow/python/data/kernel_tests/padded_batch_test.py index beec8c3bd6b..e42da988989 100644 --- a/tensorflow/python/data/kernel_tests/padded_batch_test.py +++ b/tensorflow/python/data/kernel_tests/padded_batch_test.py @@ -31,6 +31,7 @@ from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops from tensorflow.python.ops import string_ops +from tensorflow.python.ops.ragged import ragged_tensor_value from tensorflow.python.platform import test from tensorflow.python.util import compat @@ -224,12 +225,20 @@ class PaddedBatchTest(test_base.DatasetTestBase, parameterized.TestCase): @combinations.generate(test_base.default_test_combinations()) def testPaddedBatchSparseError(self): - def _map_fn(i): - return sparse_tensor.SparseTensorValue( - indices=[[0, 0]], values=(i * [1]), dense_shape=[1, 1]), i + st = sparse_tensor.SparseTensorValue( + indices=[[0, 0]], values=([42]), dense_shape=[1, 1]) with self.assertRaises(TypeError): - _ = dataset_ops.Dataset.range(10).map(_map_fn).padded_batch(10) + _ = dataset_ops.Dataset.from_tensors(st).repeat(10).padded_batch(10) + + @combinations.generate(test_base.default_test_combinations()) + def testPaddedBatchRaggedError(self): + + rt = ragged_tensor_value.RaggedTensorValue( + np.array([0, 42]), np.array([0, 2], dtype=np.int64)) + + with self.assertRaises(TypeError): + _ = dataset_ops.Dataset.from_tensors(rt).repeat(10).padded_batch(10) @combinations.generate(test_base.default_test_combinations()) def testPaddedBatchShapeErrorWrongRank(self): diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py index 4b25eb3a273..2bd34b195e4 100644 --- a/tensorflow/python/data/ops/dataset_ops.py +++ b/tensorflow/python/data/ops/dataset_ops.py @@ -39,7 +39,6 @@ from tensorflow.python.data.ops import iterator_ops from tensorflow.python.data.util import nest from tensorflow.python.data.util import options as options_lib from tensorflow.python.data.util import random_seed -from tensorflow.python.data.util import sparse from tensorflow.python.data.util import structure from tensorflow.python.data.util import traverse from tensorflow.python.eager import context @@ -888,9 +887,9 @@ class DatasetV2(tracking_base.Trackable, composite_tensor.CompositeTensor): Args: *args: 
follows the same semantics as python's xrange. - len(args) == 1 -> start = 0, stop = args[0], step = 1 - len(args) == 2 -> start = args[0], stop = args[1], step = 1 - len(args) == 3 -> start = args[0], stop = args[1, stop = args[2] + len(args) == 1 -> start = 0, stop = args[0], step = 1. + len(args) == 2 -> start = args[0], stop = args[1], step = 1. + len(args) == 3 -> start = args[0], stop = args[1], step = args[2]. **kwargs: - output_type: Its expected dtype. (Optional, default: `tf.int64`). @@ -1478,8 +1477,8 @@ class DatasetV2(tracking_base.Trackable, composite_tensor.CompositeTensor): # bool(tf.TensorShape(None)) is False if not all(nest.flatten(padded_shapes)): raise ValueError("You must set the `padded_shapes` argument to " - "`Dataset.padded_batch` if any component of its input" - "has an unknown rank") + "`Dataset.padded_batch` if any component of its " + "input has an unknown rank") return PaddedBatchDataset(self, batch_size, padded_shapes, padding_values, drop_remainder) @@ -3857,10 +3856,13 @@ class PaddedBatchDataset(UnaryDataset): drop_remainder): """See `Dataset.batch()` for details.""" self._input_dataset = input_dataset - if sparse.any_sparse(get_legacy_output_classes(input_dataset)): - # TODO(b/63669786): support batching of sparse tensors - raise TypeError( - "Batching of padded sparse tensors is not currently supported") + + def check_types(component_spec): + if not isinstance(component_spec, tensor_spec.TensorSpec): + raise TypeError("Padded batching of components of type %s " + "is not supported." % type(component_spec)) + + nest.map_structure(check_types, input_dataset.element_spec) self._input_dataset = input_dataset self._batch_size = ops.convert_to_tensor( batch_size, dtype=dtypes.int64, name="batch_size") diff --git a/tensorflow/python/data/ops/iterator_ops.py b/tensorflow/python/data/ops/iterator_ops.py index d3fa08ffddf..668af74acf6 100644 --- a/tensorflow/python/data/ops/iterator_ops.py +++ b/tensorflow/python/data/ops/iterator_ops.py @@ -571,8 +571,8 @@ class OwnedIterator(trackable.Trackable, composite_tensor.CompositeTensor): `components` and `element_spec` is provided. """ - error_message = "Either `dataset` or both `components` and " - "`element_spec` need to be provided." + error_message = ("Either `dataset` or both `components` and " + "`element_spec` need to be provided.") self._device = context.context().device_name diff --git a/tensorflow/python/debug/lib/dumping_callback_test.py b/tensorflow/python/debug/lib/dumping_callback_test.py index 5382965ebc4..b76077e8def 100644 --- a/tensorflow/python/debug/lib/dumping_callback_test.py +++ b/tensorflow/python/debug/lib/dumping_callback_test.py @@ -1128,7 +1128,7 @@ class TracingCallbackTest( # 1st element: tensor ID; 2nd element: 0 indicating no inf or nan. self.assertAllClose(trace.debug_tensor_value, [tensor_id, 0]) elif tensor_debug_mode == "CONCISE_HEALTH": - for tensor_value in tensor_values: + for trace in graph_exec_traces: tensor_id = reader.graph_execution_trace_to_tensor_id(trace) # 1st element: tensor ID. # 2nd element: element count.
Remaining elements: all zero because there diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index 461365b4b45..2ff8c897c80 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -66,6 +66,7 @@ py_library( ":cross_device_utils", ":device_util", ":reduce_util", + ":tpu_values", ":values", "//tensorflow/python:array_ops", "//tensorflow/python:device_lib", @@ -210,7 +211,7 @@ py_library( "//tensorflow/core:protos_all_py", "//tensorflow/python:platform", "//tensorflow/python:session", - "//tensorflow/python:training_lib", + "//tensorflow/python:training_server_lib", ], ) @@ -401,7 +402,7 @@ py_library( srcs_version = "PY2AND3", deps = [ "//tensorflow/core:protos_all_py", - "//tensorflow/python:training_lib", + "//tensorflow/python:training_server_lib", ], ) @@ -531,6 +532,7 @@ py_library( ":input_lib", ":numpy_dataset", ":reduce_util", + ":tpu_values", ":values", "//tensorflow/compiler/xla/experimental/xla_sharding", "//tensorflow/python:array_ops", @@ -612,17 +614,36 @@ py_library( deps = [ ":device_util", ":distribute_lib", + ":reduce_util", "//tensorflow/python:array_ops", + "//tensorflow/python:composite_tensor", "//tensorflow/python:control_flow_ops", - "//tensorflow/python:dtypes", "//tensorflow/python:framework_ops", - "//tensorflow/python:resource_variable_ops", - "//tensorflow/python:training", + "//tensorflow/python:math_ops", + "//tensorflow/python:tensor_util", + "//tensorflow/python:type_spec", "//tensorflow/python:util", + "//tensorflow/python:variable_scope", + "//tensorflow/python:variables", "//tensorflow/python/eager:context", - "//tensorflow/python/tpu:tpu_lib", + "//tensorflow/python/eager:tape", + "//tensorflow/python/training/saving:saveable_object", + "//tensorflow/python/training/saving:saveable_object_util", "//tensorflow/python/training/tracking:base", - "@six_archive//:six", + ], +) + +py_library( + name = "tpu_values", + srcs = ["tpu_values.py"], + deps = [ + ":values", + "//tensorflow/python:framework_ops", + "//tensorflow/python:math_ops", + "//tensorflow/python:resource_variable_ops_gen", + "//tensorflow/python/eager:context", + "//tensorflow/python/eager:tape", + "//tensorflow/python/tpu:tpu_lib", ], ) @@ -790,8 +811,7 @@ cuda_py_test( name = "cross_device_ops_test", srcs = ["cross_device_ops_test.py"], tags = [ - # TODO(b/138143527): Re-enable after fixing Guitar failure. - # "multi_and_single_gpu", + "multi_and_single_gpu", ], deps = [ ":collective_all_reduce_strategy", @@ -883,7 +903,7 @@ distribute_py_test( srcs = ["values_test.py"], main = "values_test.py", tags = [ - "no_oss", # http://b/119349471 + "multi_and_single_gpu", ], deps = [ ":mirrored_strategy", @@ -946,6 +966,7 @@ distribute_py_test( name = "custom_training_loop_input_test", srcs = ["custom_training_loop_input_test.py"], main = "custom_training_loop_input_test.py", + shard_count = 5, tags = [ "multi_and_single_gpu", ], @@ -1146,7 +1167,6 @@ cuda_py_test( tags = [ "multi_and_single_gpu", "no_windows_gpu", # TODO(b/130551176) - "noguitar", ], deps = [ ":combinations", diff --git a/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py index 78589762ae5..62759dd0853 100644 --- a/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py +++ b/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py @@ -1,4 +1,4 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+# Copyright 2018-2020 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,9 +18,9 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import collections import os +import re import subprocess from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver from tensorflow.python.distribute.cluster_resolver.cluster_resolver import format_master_url @@ -28,87 +28,235 @@ from tensorflow.python.training.server_lib import ClusterSpec from tensorflow.python.util.tf_export import tf_export +def expand_hostlist(hostlist): + """Create a list of hosts out of a SLURM hostlist. + + The order of nodes is preserved and no deduplication is done. + Input: 'n[1-2],m5,o[3-4,6,7-9]' + Output: ['n1', 'n2', 'm5', 'o3', 'o4', 'o6', 'o7', 'o8', 'o9'] + """ + + def split_hostlist(hostlist): + """Split hostlist at commas outside of range expressions ('[3-5]').""" + in_brackets = False + cur_host = '' + for c in hostlist: + if in_brackets: + assert c != '[' + if c == ']': + in_brackets = False + elif c == '[': + in_brackets = True + elif c == ',': + assert cur_host != '' + yield cur_host + cur_host = '' + continue + cur_host += c + if cur_host: + yield cur_host + + def expand_range_expression(range_exp): + """Expand a range expression like '3-5' to values 3,4,5.""" + for part in range_exp.split(','): + sub_range = part.split('-') + if len(sub_range) == 1: + sub_range = sub_range * 2 + else: + assert len(sub_range) == 2 + for i in range(int(sub_range[0]), int(sub_range[1]) + 1): + yield i + + hosts = [] + try: + for part in split_hostlist(hostlist): + # Match prefix (anything but a range expression) and range expression. + # Both are optional. + m = re.match(r'([^,[\]]*)(\[([^\]]+)\])?$', part) + if m is None: + raise ValueError('Invalid part: %s' % part) + prefix = m.group(1) or '' + if m.group(3) is None: + hosts.append(prefix) + else: + hosts.extend( + prefix + str(i) for i in expand_range_expression(m.group(3))) + except Exception as e: + raise ValueError('Invalid hostlist format "%s": %s' % (hostlist, e)) + return hosts + + +def expand_tasks_per_node(tasks_per_node): + """Expand the tasks per node expression from SLURM. + + The order is preserved so it can be matched to the hostlist. + Input: '3(x2),2,1' + Output: [3, 3, 2, 1] + """ + result = [] + try: + for part in tasks_per_node.split(','): + m = re.match(r'(\d+)(\(x(\d+)\))?$', part) + assert m is not None + num_tasks = int(m.group(1)) + num_repetitions = int(m.group(3) or 1) + result.extend([num_tasks] * num_repetitions) + except Exception as e: + raise ValueError('Invalid tasks-per-node list format "%s": %s' % + (tasks_per_node, e)) + return result + + +def _get_slurm_var(name): + """Get the SLURM variable from the environment. + + Args: + name: Name of the step variable + + Returns: + SLURM_<name> from os.environ + Raises: + RuntimeError if variable is not found + """ + name = 'SLURM_' + name + try: + return os.environ[name] + except KeyError: + raise RuntimeError('%s not found in environment. ' 'Not running inside a SLURM step?'
% name) + + +def get_num_slurm_tasks(): + """Return the number of SLURM tasks of the current job step. + + Returns: + The number of tasks as an int + """ + return int(_get_slurm_var('STEP_NUM_TASKS')) + + +def _get_num_nvidia_gpus(): + """Get the number of NVIDIA GPUs from CUDA_VISIBLE_DEVICES or nvidia-smi. + + Returns: + Number of GPUs available on the node + Raises: + RuntimeError if executing nvidia-smi failed + """ + try: + return len(os.environ['CUDA_VISIBLE_DEVICES'].split(',')) + except KeyError: + pass # Ignore and fall back to using nvidia-smi + try: + output = subprocess.check_output(['nvidia-smi', '--list-gpus'], + encoding='utf-8') + return sum(l.startswith('GPU ') for l in output.strip().split('\n')) + except subprocess.CalledProcessError as e: + raise RuntimeError('Could not get number of GPUs from nvidia-smi. ' + 'Maybe it is missing?\nOutput: %s' % e.output) + + +def get_num_gpus(): + """Return the number of GPUs visible on the current node. + + Currently only implemented for NVIDIA GPUs. + """ + return _get_num_nvidia_gpus() + + @tf_export('distribute.cluster_resolver.SlurmClusterResolver') class SlurmClusterResolver(ClusterResolver): """ClusterResolver for system with Slurm workload manager. - This is an implementation of cluster resolvers for Slurm clusters. This allows - the specification of jobs and task counts, number of tasks per node, number of - GPUs on each node and number of GPUs for each task. It retrieves system + This is an implementation of ClusterResolver for Slurm clusters. This allows + the specification of jobs and task counts, number of tasks per node, number + of GPUs on each node and number of GPUs for each task. It retrieves system attributes by Slurm environment variables, resolves allocated computing node names, constructs a cluster and returns a ClusterResolver object which can be - use for distributed TensorFlow. + used for distributed TensorFlow. """ - def _resolve_hostnames(self): - """Resolve host names of nodes allocated in current jobs. - - Returns: - A list of node names as strings. - """ - hostlist = (subprocess.check_output(['scontrol', 'show', 'hostname']). - decode('utf-8').strip().split('\n')) - return hostlist def __init__(self, - jobs, + jobs=None, port_base=8888, - gpus_per_node=1, - gpus_per_task=1, + gpus_per_node=None, + gpus_per_task=None, tasks_per_node=None, auto_set_gpu=True, rpc_layer='grpc'): """Creates a new SlurmClusterResolver object. - This takes in parameters and creates a SlurmClusterResolver object. It uses - those parameters to check which nodes will processes reside on and resolves - their hostnames. With the number of the GPUs on each node and number of GPUs - for each task it offsets the port number for each process and allocates - GPUs to tasks by setting environment variables. The resolver currently - supports homogeneous tasks and default Slurm process allocation. + For any parameter not set, it will query the environment for the value. + It uses those parameters to check which nodes processes will reside on and + resolves their hostnames. + With the number of tasks per node, it offsets the port number for each + process. + With the number of GPUs per node and per task, it allocates GPUs to tasks + by setting environment variables. + Using the resolver works best (and is easier) with homogeneous tasks, but + heterogeneous tasks (number of tasks varying per node) are also possible as + long as the number of GPUs per task stays constant.
+ + Used environment variables: + - SLURM_PROCID + - (opt) SLURM_STEP_NUM_TASKS + - (opt) SLURM_STEP_NODELIST + - (opt) SLURM_TASKS_PER_NODE Args: jobs: Dictionary with job names as key and number of tasks in the job as - value. + value. Defaults to as many 'worker' tasks as there are (Slurm) tasks. port_base: The first port number to start with for processes on a node. - gpus_per_node: Number of GPUs available on each node. - gpus_per_task: Number of GPUs to be used for each task. - tasks_per_node: Number of tasks to run on each node, if not set defaults - to Slurm's output environment variable SLURM_NTASKS_PER_NODE. + gpus_per_node: Number of GPUs available on each node. Defaults to the + number of GPUs reported by nvidia-smi. + gpus_per_task: Number of GPUs to be used for each task. Default is to + evenly distribute the gpus_per_node to tasks_per_node. + tasks_per_node: Number of tasks running on each node. Can be an integer if + the number of tasks per node is constant or a dictionary mapping + hostnames to number of tasks on that node. If not set, the Slurm + environment is queried for the correct mapping. auto_set_gpu: Set the visible CUDA devices automatically while resolving the cluster by setting CUDA_VISIBLE_DEVICES environment variable. Defaults to True. - rpc_layer: (Optional) The protocol TensorFlow uses to communicate between - nodes. Defaults to 'grpc'. + rpc_layer: The protocol TensorFlow uses to communicate between nodes. + Defaults to 'grpc'. Returns: A ClusterSpec object which can be used with distributed TensorFlow. Raises: - RuntimeError: If requested more GPUs per node then available or requested - more tasks then assigned tasks. + RuntimeError: If more GPUs per node are requested than available, more + tasks are requested than were assigned, or + resolving missing values from the environment fails. """ - # check if launched by mpirun - if 'OMPI_COMM_WORLD_RANK' in os.environ: - self._rank = int(os.environ['OMPI_COMM_WORLD_RANK']) - num_tasks = int(os.environ['OMPI_COMM_WORLD_SIZE']) - else: - self._rank = int(os.environ['SLURM_PROCID']) - num_tasks = int(os.environ['SLURM_NTASKS']) + self._rank = self._resolve_own_rank() - self._jobs = collections.OrderedDict(sorted(jobs.items())) + if jobs is None: + jobs = {'worker': self._resolve_num_tasks()} + + self._jobs = jobs self._port_base = port_base - # user specification overrides SLURM specification - if tasks_per_node is not None: - self._tasks_per_node = tasks_per_node - elif tasks_per_node is None and 'SLURM_NTASKS_PER_NODE' in os.environ: - self._tasks_per_node = int(os.environ['SLURM_NTASKS_PER_NODE']) + if tasks_per_node is None: + self._task_configuration = self._resolve_task_configuration() + elif isinstance(tasks_per_node, dict): + # User can pass in an explicit configuration as a dict + self._task_configuration = tasks_per_node else: - raise RuntimeError('Neither `tasks_per_node` or ' - 'SLURM_NTASKS_PER_NODE is set.') + # User can pass a fixed number of tasks per node + hostlist = self._resolve_hostlist() + self._task_configuration = { + host: int(tasks_per_node) for host in hostlist + } + max_tasks_per_node = max(self._task_configuration.values()) + num_tasks = sum(self._task_configuration.values()) + + if gpus_per_node is None: + gpus_per_node = get_num_gpus() + if gpus_per_task is None: + gpus_per_task = gpus_per_node // max_tasks_per_node self._gpus_per_node = gpus_per_node self._gpus_per_task = gpus_per_task @@ -120,11 +268,39 @@ class SlurmClusterResolver(ClusterResolver): self._gpu_allocation = [] self._cluster_allocation = {} - if self._tasks_per_node * self._gpus_per_task > self._gpus_per_node: + if max_tasks_per_node * self._gpus_per_task > self._gpus_per_node: raise RuntimeError('Requested more GPUs per node then available.') if sum(self._jobs.values()) != num_tasks: - raise RuntimeError('Requested more tasks then assigned tasks.') + raise RuntimeError('Requested {} tasks but {} were assigned.'.format( + sum(self._jobs.values()), num_tasks)) + + def _resolve_own_rank(self): + """Return the rank of the current task in range [0, num_tasks).""" + return int(_get_slurm_var('PROCID')) + + def _resolve_num_tasks(self): + """Return the number of tasks for the current job step.""" + return get_num_slurm_tasks() + + def _resolve_hostlist(self): + """Return a list of hostnames for nodes running the current job step.""" + return expand_hostlist(_get_slurm_var('STEP_NODELIST')) + + def _resolve_task_configuration(self): + """Create a mapping of hostnames to the number of tasks allocated on each. + + Reads the SLURM environment to determine the nodes involved in the current + job step and number of tasks running on each node. + + Returns a dictionary mapping each hostname to the number of tasks. + """ + hostlist = self._resolve_hostlist() + tasks_per_node = expand_tasks_per_node( + _get_slurm_var('STEP_TASKS_PER_NODE')) + return { + host: num_tasks for (host, num_tasks) in zip(hostlist, tasks_per_node) + } def cluster_spec(self): """Returns a ClusterSpec object based on the latest instance group info. @@ -141,16 +317,15 @@ class SlurmClusterResolver(ClusterResolver): A ClusterSpec containing host information retrieved from Slurm's environment variables.
""" - hostlist = self._resolve_hostnames() task_list = [] self._gpu_allocation = [] self._cluster_allocation = {} - for host in hostlist: + # Sort to make sure the order is the same for each run + for host, num_tasks in sorted(self._task_configuration.items()): for port_offset, gpu_offset in zip( - range(self._tasks_per_node), - range(0, self._gpus_per_node, self._gpus_per_task)): + range(num_tasks), range(0, self._gpus_per_node, self._gpus_per_task)): host_addr = '%s:%d' % (host, self._port_base + port_offset) task_list.append(host_addr) @@ -164,7 +339,8 @@ class SlurmClusterResolver(ClusterResolver): cluster_rank_offset_start = 0 cluster_rank_offset_end = 0 - for task_type, num_tasks in self._jobs.items(): + # Sort to make sure the order is the same for each run + for task_type, num_tasks in sorted(self._jobs.items()): cluster_rank_offset_end = cluster_rank_offset_start + num_tasks self._cluster_allocation[task_type] = ( @@ -223,4 +399,4 @@ class SlurmClusterResolver(ClusterResolver): config_proto=None): # Unused, since this is set in __init__ manually. del task_type, task_id, config_proto - return {'GPU': self._gpus_per_node} + return {'GPU': self._gpus_per_task} diff --git a/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver_test.py index c641fe60853..07f9e81994a 100644 --- a/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver_test.py +++ b/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver_test.py @@ -1,4 +1,4 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -21,6 +21,7 @@ from __future__ import print_function import os from tensorflow.python.distribute.cluster_resolver import SlurmClusterResolver +from tensorflow.python.distribute.cluster_resolver.slurm_cluster_resolver import expand_hostlist, expand_tasks_per_node from tensorflow.python.platform import test from tensorflow.python.training import server_lib @@ -29,8 +30,23 @@ mock = test.mock class SlurmClusterResolverTest(test.TestCase): - def mock_resolve_hostnames_output(self): - return ['t02n13', 't02n41', 't02n43', 't02n44'] + def test_expand_hostlist(self): + self.assertEqual(expand_hostlist('n1'), ['n1']) + self.assertEqual(expand_hostlist('n[1,3]'), ['n1', 'n3']) + self.assertEqual(expand_hostlist('n[1-3]'), ['n1', 'n2', 'n3']) + self.assertEqual( + expand_hostlist('n[1-2],m5,o[3-4,6,7-9]'), + ['n1', 'n2', 'm5', 'o3', 'o4', 'o6', 'o7', 'o8', 'o9']) + + def test_expand_tasks_per_node(self): + self.assertEqual(expand_tasks_per_node('2'), [2]) + self.assertEqual(expand_tasks_per_node('2,1,3'), [2, 1, 3]) + self.assertEqual(expand_tasks_per_node('3(x2),2,1'), [3, 3, 2, 1]) + self.assertEqual( + expand_tasks_per_node('3(x2),2,11(x4)'), [3, 3, 2, 11, 11, 11, 11]) + self.assertEqual( + expand_tasks_per_node('13(x10)'), + [13, 13, 13, 13, 13, 13, 13, 13, 13, 13]) def _verifyClusterSpecEquality(self, cluster_spec, expected_proto): self.assertProtoEquals(expected_proto, cluster_spec.as_cluster_def()) @@ -44,9 +60,36 @@ class SlurmClusterResolverTest(test.TestCase): expected_proto, server_lib.ClusterSpec(cluster_spec.as_dict()).as_cluster_def()) - @mock.patch.dict(os.environ, {'SLURM_PROCID': '0', 'SLURM_NTASKS': '3'}) - @mock.patch.object(SlurmClusterResolver, '_resolve_hostnames', - mock_resolve_hostnames_output) + @mock.patch.dict( + os.environ, { + 'SLURM_PROCID': '0', + 'SLURM_STEP_NUM_TASKS': '3', + 'SLURM_STEP_TASKS_PER_NODE': '1(x3)', + 'SLURM_STEP_NODELIST': 't02n13,t02n41,t02n43', + 'CUDA_VISIBLE_DEVICES': '0', + }) + def testSimpleRetrievalFromEnv(self): + slurm_cluster_resolver = SlurmClusterResolver() + + actual_cluster_spec = slurm_cluster_resolver.cluster_spec() + expected_proto = """ + job { name: 'worker' tasks { key: 0 value: 't02n13:8888' } + tasks { key: 1 value: 't02n41:8888' } + tasks { key: 2 value: 't02n43:8888' } } + """ + self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto) + self.assertEqual( + slurm_cluster_resolver.master('worker', 0, rpc_layer='grpc'), + 'grpc://t02n13:8888') + self.assertEqual(slurm_cluster_resolver.num_accelerators(), {'GPU': 1}) + self.assertEqual(os.environ['CUDA_VISIBLE_DEVICES'], '0') + + @mock.patch.dict( + os.environ, { + 'SLURM_PROCID': '0', + 'SLURM_STEP_NUM_TASKS': '3', + 'SLURM_STEP_NODELIST': 't02n13,t02n41,t02n43', + }) def testSimpleSuccessfulRetrieval(self): slurm_cluster_resolver = SlurmClusterResolver( jobs={ @@ -67,9 +110,12 @@ class SlurmClusterResolverTest(test.TestCase): """ self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto) - @mock.patch.dict(os.environ, {'SLURM_PROCID': '0', 'SLURM_NTASKS': '3'}) - @mock.patch.object(SlurmClusterResolver, '_resolve_hostnames', - mock_resolve_hostnames_output) + @mock.patch.dict( + os.environ, { + 'SLURM_PROCID': '0', + 'SLURM_STEP_NUM_TASKS': '3', + 'SLURM_STEP_NODELIST': 't02n13,t02n41,t02n43', + }) def testSimpleMasterRetrieval(self): slurm_cluster_resolver = SlurmClusterResolver( jobs={ @@ -92,13 +138,13 @@ class SlurmClusterResolverTest(test.TestCase): slurm_cluster_resolver.master('ps', 0, rpc_layer='test'), 'test://t02n13:8888') - 
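# With the environment-derived defaults introduced above, constructing a
# resolver inside a Slurm job step needs no arguments; a sketch with
# hypothetical SLURM_* values of the same shape these tests use:
import os

os.environ['SLURM_PROCID'] = '0'
os.environ['SLURM_STEP_NUM_TASKS'] = '3'
os.environ['SLURM_STEP_NODELIST'] = 't02n13,t02n41,t02n43'
os.environ['SLURM_STEP_TASKS_PER_NODE'] = '1(x3)'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

from tensorflow.python.distribute.cluster_resolver import SlurmClusterResolver

resolver = SlurmClusterResolver()  # defaults to a single 'worker' job
spec = resolver.cluster_spec()
# spec now maps 'worker' to ['t02n13:8888', 't02n41:8888', 't02n43:8888'].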
@mock.patch.dict(os.environ, { - 'SLURM_PROCID': '0', - 'SLURM_NTASKS': '3', - 'SLURM_NTASKS_PER_NODE': '1' - }) - @mock.patch.object(SlurmClusterResolver, '_resolve_hostnames', - mock_resolve_hostnames_output) + @mock.patch.dict( + os.environ, { + 'SLURM_PROCID': '0', + 'SLURM_STEP_NUM_TASKS': '3', + 'SLURM_STEP_TASKS_PER_NODE': '1(x3)', + 'SLURM_STEP_NODELIST': 't02n13,t02n41,t02n43', + }) def testTaskPerNodeNotSetRetrieval(self): slurm_cluster_resolver = SlurmClusterResolver( jobs={ @@ -121,12 +167,11 @@ class SlurmClusterResolverTest(test.TestCase): @mock.patch.dict( os.environ, { 'SLURM_PROCID': '1', - 'SLURM_NTASKS': '5', - 'SLURM_NTASKS_PER_NODE': '2', - 'CUDA_VISIBLE_DEVICES': '' + 'SLURM_STEP_NUM_TASKS': '5', + 'SLURM_STEP_TASKS_PER_NODE': '2(x2),1', + 'SLURM_STEP_NODELIST': 't02n13,t02n41,t02n43', + 'CUDA_VISIBLE_DEVICES': '', }) - @mock.patch.object(SlurmClusterResolver, '_resolve_hostnames', - mock_resolve_hostnames_output) def testMultiTaskPerNodeRetrieval(self): slurm_cluster_resolver = SlurmClusterResolver( jobs={ @@ -153,12 +198,11 @@ class SlurmClusterResolverTest(test.TestCase): @mock.patch.dict( os.environ, { 'SLURM_PROCID': '1', - 'SLURM_NTASKS': '5', - 'SLURM_NTASKS_PER_NODE': '2', - 'CUDA_VISIBLE_DEVICES': '' + 'SLURM_STEP_NUM_TASKS': '5', + 'SLURM_STEP_TASKS_PER_NODE': '2(x2),1', + 'SLURM_STEP_NODELIST': 't02n13,t02n41,t02n43', + 'CUDA_VISIBLE_DEVICES': '', }) - @mock.patch.object(SlurmClusterResolver, '_resolve_hostnames', - mock_resolve_hostnames_output) def testMultipleGpusPerTaskRetrieval(self): slurm_cluster_resolver = SlurmClusterResolver( jobs={ diff --git a/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py index 3658a6bcaa9..305af265b03 100644 --- a/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py +++ b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py @@ -88,7 +88,7 @@ class TFConfigClusterResolver(ClusterResolver): @property def task_id(self): - if self._task_type is None: + if self._task_id is None: task_info = _get_value_in_tfconfig(_TASK_KEY, {}) return int(task_info['index']) if 'index' in task_info else None else: diff --git a/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver_test.py index c239d60a224..2989e24c284 100644 --- a/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver_test.py +++ b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver_test.py @@ -224,6 +224,21 @@ class TFConfigClusterResolverTest(test.TestCase): cluster_resolver = TFConfigClusterResolver() self.assertEqual(1, cluster_resolver.task_id) + def testTaskIndexOverride(self): + os.environ['TF_CONFIG'] = """ + { + "cluster": { + "worker": ["worker0:2222", "worker1:2222"] + }, + "task": { + "type": "worker", + "index": "0" + } + } + """ + cluster_resolver = TFConfigClusterResolver(task_id=1) + self.assertEqual(1, cluster_resolver.task_id) + def testZeroItemsInClusterSpecMasterRead(self): os.environ['TF_CONFIG'] = """ {} diff --git a/tensorflow/python/distribute/cross_device_ops.py b/tensorflow/python/distribute/cross_device_ops.py index ba8f7542712..2f2a105df84 100644 --- a/tensorflow/python/distribute/cross_device_ops.py +++ b/tensorflow/python/distribute/cross_device_ops.py @@ -27,6 +27,7 @@ from tensorflow.python.client import device_lib from tensorflow.python.distribute import 
cross_device_utils from tensorflow.python.distribute import device_util from tensorflow.python.distribute import reduce_util +from tensorflow.python.distribute import tpu_values from tensorflow.python.distribute import values as value_lib from tensorflow.python.eager import context from tensorflow.python.eager import def_function @@ -34,7 +35,6 @@ from tensorflow.python.framework import kernels from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops -from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import resource_variable_ops from tensorflow.python.platform import tf_logging as logging @@ -63,8 +63,8 @@ def validate_destinations(destinations): if not isinstance( destinations, (value_lib.DistributedValues, ops.Tensor, value_lib.AggregatingVariable, - six.string_types, value_lib.TPUMirroredVariable) - ) and not resource_variable_ops.is_resource_variable(destinations): + six.string_types, tpu_values.TPUMirroredVariable + )) and not resource_variable_ops.is_resource_variable(destinations): raise ValueError("destinations must be one of a `DistributedValues` object," " a tf.Variable object, or a device string.") @@ -1074,7 +1074,7 @@ class CollectiveAllReduce(CrossDeviceOps): reduced_gv_list): control_input_grads = [g for g, _ in reduced_gv_list[-1]] else: - control_input_grads = [] + control_input_grads = None collective_reduced = cross_device_utils.build_collective_reduce( grads, self._num_workers, self._collective_keys, "Add", "Id", communication_hint, control_input_grads) @@ -1123,87 +1123,20 @@ class CollectiveAllReduce(CrossDeviceOps): # optimizer and packed into a single all-reduce. with ops.name_scope("allreduce"): for grad_and_vars in chunk: - # `grad_and_vars` contains gradients for the same variable but from - # different devices. Because current CollectiveAllGather - # implementations require input IndexedSlices to have consistent - # length across the board, we handle the reduction of IndexedSlices - # as follows: - # 1. Gather the lengths of IndexedSlices from all participants. - # 2. If they have consistent length, apply all_gather. - # 3. Otherwise convert IndexedSlices to dense tensors and apply - # all_reduce. + grads = [g for g, _ in grad_and_vars] - def all_gather(): - """Use all_gather to aggregate `IndexedSlices`.""" - grads = [g for g, _ in grad_and_vars] # pylint: disable=cell-var-from-loop - values = [g.values for g in grads] - indices = [g.indices for g in grads] - - # Build two separate allgathers, one for values, the other one for - # indices. 
- gathered_values = cross_device_utils.build_collective_gather( - values, self._num_workers, self._collective_keys) - gathered_indices = cross_device_utils.build_collective_gather( - indices, self._num_workers, self._collective_keys) - assert len(gathered_values) == len(gathered_indices) - - gathered_grads = [] - for i in range(len(values)): - gathered_grad = ops.IndexedSlices( - values=gathered_values[i], - indices=gathered_indices[i], - dense_shape=grads[i].dense_shape) - gathered_grads.append(gathered_grad) - return gathered_grads - - def all_reduce(): - """Use all_reduce to aggregate `IndexedSlices`.""" - grads = [] - for g, _ in grad_and_vars: # pylint: disable=cell-var-from-loop - with ops.device(g.device): - grads.append(ops.convert_to_tensor(g)) - - reduced_dense_grads = cross_device_utils.build_collective_reduce( - grads, self._num_workers, self._collective_keys, "Add", "Id", - communication_hint) - # We have to convert dense grad to IndexedSlice because all_reduce() - # and all_gather() must have the same return type as required by - # control_flow_ops.cond. - reduced_grads = [] - for grad in reduced_dense_grads: - reduced_grads.append( - ops.IndexedSlices( - values=grad, - indices=math_ops.range(array_ops.shape(grad)[0]), - dense_shape=array_ops.shape(grad))) - return reduced_grads - - indexed_slice_lengths = [] - for g, _ in grad_and_vars: - with ops.device(g.device): - indexed_slice_lengths.append(array_ops.shape(g.indices)) - gathered_indexed_slice_lengths = ( - cross_device_utils.build_collective_gather( - indexed_slice_lengths, self._num_workers, - self._collective_keys)) - # gathered_indexed_slice_lengths takes the following forms: - # [[length1_on_gpu_0, length2_on_gpu0, ...], - # [length1_on_gpu_1, length2_on_gpu1, ...] - # ... - # ] - # Each sublist is value-wise identical but resides on different - # devices. Since each sublist has the same value, we can just use the - # first sublist to compute the condition. - collective_reduced = control_flow_ops.cond( - math_ops.equal( - math_ops.reduce_max(gathered_indexed_slice_lengths[0]), - math_ops.reduce_min(gathered_indexed_slice_lengths[0])), - all_gather, all_reduce) - # tf.cond implicitly unpacks singleton list to single value, hence - # we need to re-wrap the single value into a singleton list here. - if not isinstance(collective_reduced, list): - collective_reduced = [collective_reduced] + # Add control dependencies per device from the last gradients to the + # current set, in order to serialize NCCL launches. 
+ if (communication_hint == CollectiveCommunication.NCCL.value and + reduced_gv_list): + control_input_grads = [g for g, _ in reduced_gv_list[-1]] + else: + control_input_grads = None + collective_reduced = ( + cross_device_utils.build_collective_gather_indexed_slices( + grads, self._num_workers, self._collective_keys, + communication_hint, control_input_grads)) result = [] for (_, v), g in zip(grad_and_vars, collective_reduced): result.append([g, v]) diff --git a/tensorflow/python/distribute/cross_device_ops_test.py b/tensorflow/python/distribute/cross_device_ops_test.py index c91ec38bfd1..3a9d7b7ec44 100644 --- a/tensorflow/python/distribute/cross_device_ops_test.py +++ b/tensorflow/python/distribute/cross_device_ops_test.py @@ -301,6 +301,11 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase): @combinations.generate(reduction_to_one_combinations + allreduce_combinations) def testReductionAndBroadcast(self, cross_device_ops, devices): + if isinstance( + cross_device_ops._obj, # pylint: disable=protected-access + cross_device_ops_lib.AllReduceCrossDeviceOps + ) and context.executing_eagerly(): + self.skipTest("b/149881884") self._testReductionAndBroadcast(cross_device_ops, devices) def testChooseAlgorithm(self): @@ -432,6 +437,8 @@ class MultiWorkerCrossDeviceOpsTest(multi_worker_test_base.MultiWorkerTestBase, NUM_WORKERS = 3 +CollectiveCommunication = cross_device_ops_lib.CollectiveCommunication + class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, CrossDeviceOpsTestBase): @@ -454,6 +461,7 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, task_type, task_id, num_gpus=0, + communication=CollectiveCommunication.AUTO, use_strategy_object=False, local_mode=False, num_packs=1): @@ -469,15 +477,23 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, devices = ["/device:CPU:0"] if use_strategy_object: - strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy() + strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy( + communication=communication) strategy.extended._collective_keys = collective_keys strategy.extended._cross_device_ops._collective_keys = collective_keys return strategy, devices, "" else: collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce( - 1, num_gpus, collective_keys=collective_keys, num_packs=num_packs) + 1, + num_gpus, + collective_keys=collective_keys, + num_packs=num_packs, + communication=communication) return collective_all_reduce_ops, devices, "" else: + # NCCL requires physical GPUs for every replica, which we can't do with + # simulated multi host set up now. 
+ assert communication != CollectiveCommunication.NCCL if num_gpus: devices = [ "/job:%s/task:%d/replica:0/device:GPU:%d" % (task_type, task_id, i) @@ -489,7 +505,8 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, ] if use_strategy_object: - strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy() + strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy( + communication=communication) strategy.configure( cluster_spec=self._cluster_spec, task_type=task_type, @@ -500,8 +517,11 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, "grpc://" + self._cluster_spec[task_type][task_id]) else: collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce( - NUM_WORKERS, num_gpus, collective_keys=collective_keys, - num_packs=num_packs) + NUM_WORKERS, + num_gpus, + collective_keys=collective_keys, + num_packs=num_packs, + communication=communication) return (collective_all_reduce_ops, devices, "grpc://" + self._cluster_spec[task_type][task_id]) @@ -509,6 +529,7 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, task_type, task_id, num_gpus, + communication, use_strategy_object=False, local_mode=False, num_packs=1): @@ -516,6 +537,7 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, task_type, task_id, num_gpus, + communication=communication, use_strategy_object=use_strategy_object, local_mode=local_mode, num_packs=num_packs) @@ -645,11 +667,16 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, task_type, task_id, num_gpus, + communication, batch_reduce, variable_length, local_mode=False): collective_all_reduce, devices, master_target = self._get_test_objects( - task_type, task_id, num_gpus, local_mode=local_mode) + task_type, + task_id, + num_gpus, + communication=communication, + local_mode=local_mode) if local_mode: num_workers = 1 worker_device = None @@ -704,6 +731,7 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, self._test_reduction, self._cluster_spec, required_gpus, + communication=CollectiveCommunication.RING, use_strategy_object=use_strategy_object, num_packs=num_packs) @@ -711,25 +739,32 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, combinations.combine( mode=["graph"], required_gpus=[0, 1, 2], - batch_reduce=[True], variable_length=[True, False])) - def testReduceIndexedSlicesDistributed(self, required_gpus, batch_reduce, - variable_length): - self._run_between_graph_clients(self._test_reduce_indexed_slices, - self._cluster_spec, required_gpus, - batch_reduce, variable_length) + def testReduceIndexedSlicesDistributed(self, required_gpus, variable_length): + self._run_between_graph_clients( + self._test_reduce_indexed_slices, + self._cluster_spec, + required_gpus, + communication=CollectiveCommunication.RING, + batch_reduce=True, + variable_length=variable_length) # Collective ops doesn't support strategy with one device. 
@combinations.generate( combinations.combine( mode=["graph"], required_gpus=2, + communication=[ + CollectiveCommunication.NCCL, CollectiveCommunication.RING + ], use_strategy_object=[True, False])) - def testReductionLocal(self, required_gpus, use_strategy_object): + def testReductionLocal(self, required_gpus, communication, + use_strategy_object): self._test_reduction( None, None, required_gpus, + communication=communication, use_strategy_object=use_strategy_object, local_mode=True) @@ -738,15 +773,19 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, mode=["graph"], required_gpus=2, batch_reduce=[True, False], - variable_length=[True, False])) + variable_length=[True, False], + communication=[ + CollectiveCommunication.NCCL, CollectiveCommunication.RING + ])) def testReduceIndexedSlicesLocal(self, required_gpus, batch_reduce, - variable_length): + variable_length, communication): self._test_reduce_indexed_slices( None, None, required_gpus, - batch_reduce, - variable_length, + communication=communication, + batch_reduce=batch_reduce, + variable_length=variable_length, local_mode=True) diff --git a/tensorflow/python/distribute/cross_device_utils.py b/tensorflow/python/distribute/cross_device_utils.py index 3afb8b55b24..fa6f612af17 100644 --- a/tensorflow/python/distribute/cross_device_utils.py +++ b/tensorflow/python/distribute/cross_device_utils.py @@ -25,12 +25,12 @@ from tensorflow.python.distribute import all_reduce from tensorflow.python.distribute import values as value_lib from tensorflow.python.eager import backprop from tensorflow.python.eager import context -from tensorflow.python.eager import def_function from tensorflow.python.framework import device as pydev from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import collective_ops +from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nccl_ops @@ -304,6 +304,19 @@ class CollectiveKeys(object): self._group_key_table[key_id] = new_key return self._group_key_table[key_id] + def get_group_key_of_tensors(self, tensors): + """Returns a group key for set of tensors. + + Args: + tensors: list of `Tensor`s in a collective group. Each tensor must be on a + different device. + + Returns: + int key uniquely identifying the set of devices of these tensors. + """ + devices = [t.device for t in tensors] + return self.get_group_key(devices) + def get_op_instance_key(self): """Returns a new instance key for use in defining a collective op.""" v = self._get_thread_local_object().op_instance_key @@ -322,10 +335,12 @@ def build_collective_reduce(input_tensors, collective_keys, reduction_op='Add', unary_op='Id', - communication_hint='auto', + communication_hint='AUTO', control_inputs=None): """Build a subgraph that does one full all-reduce, using the collective Op. + This method must be called in graph mode or inside a tf.function. + Args: input_tensors: tensors within a single worker graph that are to be reduced together; must be one per device. @@ -346,37 +361,40 @@ def build_collective_reduce(input_tensors, Raises: ValueError: There must be at least two tensors over all the workers. 
""" + assert not context.executing_eagerly(), ( + 'build_collective_reduce can only be called in graph mode or inside ' + 'tf.function') + group_size = len(input_tensors) * num_workers if group_size < 2: return input_tensors - devices = [t.device for t in input_tensors] - num_devices = len(devices) - group_key = collective_keys.get_group_key(devices) + group_key = collective_keys.get_group_key_of_tensors(input_tensors) instance_key = collective_keys.get_op_instance_key() subdiv_offsets = [0] # TODO(tucker): maybe support non-default subdiv spec - if control_inputs: - assert len(control_inputs) == len(input_tensors) out_tensors = [] - for dev_idx in range(num_devices): - with ops.device(devices[dev_idx]): - if control_inputs: - assert control_inputs[dev_idx].device == input_tensors[dev_idx].device - with ops.control_dependencies([control_inputs[dev_idx]]): - reduce_op = collective_ops.all_reduce( - input_tensors[dev_idx], group_size, group_key, instance_key, - reduction_op, unary_op, subdiv_offsets, communication_hint) - else: - reduce_op = collective_ops.all_reduce( - input_tensors[dev_idx], group_size, group_key, instance_key, - reduction_op, unary_op, subdiv_offsets, communication_hint) - out_tensors.append(reduce_op) + for idx, input_tensor in enumerate(input_tensors): + with ops.device(input_tensor.device): + with ops.control_dependencies( + _control_input(input_tensors, control_inputs, idx)): + out_tensor = collective_ops.all_reduce(input_tensor, group_size, + group_key, instance_key, + reduction_op, unary_op, + subdiv_offsets, + communication_hint) + out_tensors.append(out_tensor) return out_tensors -def build_collective_gather(input_tensors, num_workers, collective_keys): +def build_collective_gather(input_tensors, + num_workers, + collective_keys, + communication_hint='AUTO', + control_inputs=None): """Build a subgraph that does one full all-gather, using the collective Op. + This method must be called in graph mode or inside a tf.function. + Args: input_tensors: tensors within a single worker graph that are to be gathered together; must be one per device. @@ -384,37 +402,136 @@ def build_collective_gather(input_tensors, num_workers, collective_keys): will be doing this same reduction. The reduction will actually include the corresponding tensors at all these workers. collective_keys: a CollectiveKeys object. + communication_hint: string providing hint to runtime for choosing collective + implementation. + control_inputs: if not None, add control edges between control_inputs and + (index-wise) corresponding collective_gather tensors Returns: An array of final tensors, one per device, computed by the full gather. - - Raises: - ValueError: There must be at least two tensors over all the workers. 
""" + assert not context.executing_eagerly(), ( + 'build_collective_gather can only be called in graph mode or inside ' + 'tf.function') + group_size = len(input_tensors) * num_workers if group_size < 2: return input_tensors - devices = [t.device for t in input_tensors] - num_devices = len(devices) - group_key = collective_keys.get_group_key(devices) + group_key = collective_keys.get_group_key_of_tensors(input_tensors) instance_key = collective_keys.get_op_instance_key() - def collective_all_gather(): - """Call collective allgather.""" - assert not context.executing_eagerly() - out_tensors = [] - for d in range(num_devices): - with ops.device(devices[d]): - gather_op = collective_ops.all_gather(input_tensors[d], group_size, - group_key, instance_key) - out_tensors.append(gather_op) - return out_tensors + out_tensors = [] + for idx, input_tensor in enumerate(input_tensors): + with ops.device(input_tensor.device): + with ops.control_dependencies( + _control_input(input_tensors, control_inputs, idx)): + out_tensor = collective_ops.all_gather(input_tensor, group_size, + group_key, instance_key, + communication_hint) + out_tensors.append(out_tensor) + return out_tensors - if context.executing_eagerly(): - # Collective ops will block unless they are executed concurrently such as in - # a graph or a defun. - collective_all_gather = def_function.function(collective_all_gather) - return collective_all_gather() + +def build_collective_gather_indexed_slices(input_slices_list, + num_workers, + collective_keys, + communication_hint='AUTO', + control_inputs=None): + """Build a subgraph that all-gathers IndexedSlices using the collective Op. + + This method must be called in graph mode or inside a tf.function. + + Args: + input_slices_list: a list of IndexedSlices within a single worker graph that + are to be gathered together; must be one per device. + num_workers: total number of workers with identical independent graphs that + will be doing this same reduction. The reduction will actually include + the corresponding tensors at all these workers. + collective_keys: a CollectiveKeys object. + communication_hint: string providing hint to runtime for choosing collective + implementation. + control_inputs: if not None, add control edges between control_inputs and + (index-wise) corresponding collective_reduce tensors + + Returns: + An array of final IndexedSlices, one per device, computed by the full + gather. + + Raises: + ValueError: if control_inputs is not None and doesn't match the length and + devices of inputs. + """ + assert not context.executing_eagerly(), ( + 'build_collective_gather_indexed_slices can only be called in graph mode' + ' or inside tf.function') + + group_size = len(input_slices_list) * num_workers + if group_size < 2: + return input_slices_list + + group_key = collective_keys.get_group_key_of_tensors(input_slices_list) + gather_length_key = collective_keys.get_op_instance_key() + gather_indices_key = collective_keys.get_op_instance_key() + gather_values_key = collective_keys.get_op_instance_key() + reduce_densified_key = collective_keys.get_op_instance_key() + + # Current CollectiveAllGather implementations require input IndexedSlices to + # have consistent length across the board, we handle the reduction of + # IndexedSlices as follows: + # 1. Gather the lengths of IndexedSlices from all participants. + # 2. If they have consistent length, apply all_gather. + # 3. Otherwise convert IndexedSlices to dense tensors and apply + # all_reduce. 
+ out_slices_list = [] + for idx, input_slices in enumerate(input_slices_list): + # pylint: disable = cell-var-from-loop + with ops.device(input_slices.device): + + def all_gather(): + """Use all_gather to aggregate `IndexedSlices`.""" + all_values = collective_ops.all_gather(input_slices.values, group_size, + group_key, gather_values_key, + communication_hint) + # Add control dependency to order the all-gather. + control = [all_values] if communication_hint == 'NCCL' else [] + with ops.control_dependencies(control): + all_indices = collective_ops.all_gather(input_slices.indices, + group_size, group_key, + gather_indices_key, + communication_hint) + return ops.IndexedSlices( + values=all_values, + indices=all_indices, + dense_shape=input_slices.dense_shape) + + def densify_and_all_reduce(): + """Use all_reduce to aggregate `IndexedSlices`.""" + densified = ops.convert_to_tensor(input_slices) + reduced = collective_ops.all_reduce(densified, group_size, group_key, + reduce_densified_key, 'Add', 'Id', + [0], communication_hint) + # We have to convert dense grad to IndexedSlice because all_reduce() + # and all_gather() must have the same return type as required by + # control_flow_ops.cond. + return ops.IndexedSlices( + values=reduced, + indices=math_ops.range(array_ops.shape(reduced)[0]), + dense_shape=input_slices.dense_shape) + + length = array_ops.shape(input_slices.indices) + with ops.control_dependencies( + _control_input(input_slices, control_inputs, idx)): + all_lengths = collective_ops.all_gather(length, group_size, group_key, + gather_length_key, + communication_hint) + out_slices = control_flow_ops.cond( + math_ops.equal( + math_ops.reduce_max(all_lengths), + math_ops.reduce_min(all_lengths)), all_gather, + densify_and_all_reduce) + out_slices_list.append(out_slices) + # pylint: enable=cell-var-from-loop + return out_slices_list def sum_grad_and_var_all_reduce(grad_and_vars, @@ -777,3 +894,31 @@ def stitch_values(values_and_indices_list): assert result[i] is None result[i] = v return result + + +def _control_input(inputs, control_inputs, idx): + """Returns the `idx`-th item in control_inputs to be used in ops.control_dependencies. + + This is a helper function for building collective ops. The function checks + that the devices of control_inputs and inputs match. + + Args: + inputs: a list of `Tensor`s + control_inputs: a list or None. + idx: the index into `inputs` and `control_inputs`. + + Returns: + A one item list of the `idx`-th element of `control_inputs`, or an empty + list if `control_inputs` is None. 
+ """ + if control_inputs is None: + return [] + if len(control_inputs) != len(inputs): + raise ValueError( + 'control_inputs must match the length of the inputs, %s != %s' % + (len(control_inputs), len(inputs))) + if control_inputs[idx].device != inputs[idx].device: + raise ValueError( + 'control_inputs must match the device of the inputs, %s != %s' % + (control_inputs[idx].device, inputs[idx].device)) + return [control_inputs[idx]] diff --git a/tensorflow/python/distribute/custom_training_loop_input_test.py b/tensorflow/python/distribute/custom_training_loop_input_test.py index 7d3f99f56d5..c1554d0b098 100644 --- a/tensorflow/python/distribute/custom_training_loop_input_test.py +++ b/tensorflow/python/distribute/custom_training_loop_input_test.py @@ -29,6 +29,7 @@ from tensorflow.python.distribute import strategy_combinations from tensorflow.python.eager import def_function from tensorflow.python.eager import test from tensorflow.python.framework import constant_op +from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops @@ -616,6 +617,70 @@ class InputIterationTest(test.TestCase, parameterized.TestCase, results.append(output) self.assert_equal_flattened([[25., 36.], [49., 64.]], results) + @combinations.generate( + combinations.combine( + distribution=strategy_combinations.all_strategies, + mode=["eager"] + )) + def testMultiDeviceDataCapturedFunction(self, distribution): + inputs = constant_op.constant([2., 3.]) + dataset = lambda _: dataset_ops.Dataset.from_tensor_slices(inputs).repeat(5) + input_iterator = iter( + distribution.experimental_distribute_datasets_from_function(dataset)) + with distribution.scope(): + var = variables.Variable(1.0) + + @def_function.function + def train_step(input_iterator): + + def func(inputs): + return math_ops.square(inputs) + var + + per_replica_outputs = distribution.experimental_run_v2( + func, (next(input_iterator),)) + mean = distribution.reduce( + reduce_util.ReduceOp.MEAN, per_replica_outputs, axis=None) + for _ in dataset_ops.Dataset.range(1): + per_replica_outputs = distribution.experimental_run_v2( + func, (next(input_iterator),)) + mean = distribution.reduce( + reduce_util.ReduceOp.MEAN, per_replica_outputs, axis=None) + return mean + + with distribution.scope(): + if distribution.num_replicas_in_sync == 1: + self.assertAlmostEqual(10.0, self.evaluate(train_step(input_iterator))) + else: + self.assertAlmostEqual(7.5, self.evaluate(train_step(input_iterator))) + + @combinations.generate( + combinations.combine( + distribution=strategy_combinations.all_strategies, + mode=["eager"] + )) + def testDatasetOutOfRange(self, distribution): + with distribution.scope(): + a = variables.Variable( + 0.0, aggregation=variables.VariableAggregation.SUM) + + def train_step(val): + a.assign_add(math_ops.reduce_sum(val)) + + @def_function.function + def f_train_step(iterator): + distribution.experimental_run_v2(train_step, args=(next(iterator),)) + return a + + dataset = get_dataset_from_tensor_slices([5., 6., 7., 8.]).batch(2) + dist_dataset = distribution.experimental_distribute_dataset(dataset) + + iterator = iter(dist_dataset) + with self.assertRaises(errors.OutOfRangeError): + for _ in range(100): + f_train_step(iterator) + + self.assertAlmostEqual(26.0, a.numpy()) + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py index 
7c090399794..7fac3fa3d9e 100644 --- a/tensorflow/python/distribute/distribute_lib.py +++ b/tensorflow/python/distribute/distribute_lib.py @@ -12,11 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +# pylint: disable=line-too-long """Library for running a computation across multiple devices. See the guide for overview and examples: [TensorFlow v2.x](https://www.tensorflow.org/guide/distributed_training), -[TensorFlow v1.x](https://github.com/tensorflow/docs/blob/master/site/en/r1/guide/distribute_strategy.ipynb). # pylint: disable=line-too-long +[TensorFlow v1.x](https://github.com/tensorflow/docs/blob/master/site/en/r1/guide/distribute_strategy.ipynb). The intent of this library is that you can write an algorithm in a stylized way and it will be usable with a variety of different `tf.distribute.Strategy` @@ -90,6 +91,7 @@ Note that we provide a default version of `tf.distribute.Strategy` that is used when no other strategy is in scope, that provides the same API with reasonable default behavior. """ +# pylint: enable=line-too-long from __future__ import absolute_import from __future__ import division @@ -424,7 +426,7 @@ class RunOptions( Attributes: experimental_enable_dynamic_batch_size: Boolean. Only applies to - TPUStrategy. Default to False. If True, TPUStrategy will enable dynamic + TPUStrategy. Default to True. If True, TPUStrategy will enable dynamic padder to support dynamic batch size for the inputs. Otherwise only static shape inputs are allowed. experimental_bucketizing_dynamic_shape: Boolean. Only applies to diff --git a/tensorflow/python/distribute/distributed_file_utils.py b/tensorflow/python/distribute/distributed_file_utils.py index 9e4a2b202f1..b8e3fd8a8c9 100644 --- a/tensorflow/python/distribute/distributed_file_utils.py +++ b/tensorflow/python/distribute/distributed_file_utils.py @@ -50,7 +50,6 @@ from __future__ import print_function import os from tensorflow.python.distribute import distribution_strategy_context -from tensorflow.python.distribute import multi_worker_util from tensorflow.python.lib.io import file_io @@ -80,7 +79,7 @@ def write_dirpath(dirpath, strategy=None): return dirpath if not strategy.extended._in_multi_worker_mode(): # pylint: disable=protected-access return dirpath - if multi_worker_util.is_chief(): + if strategy.extended.should_checkpoint: return dirpath # If this worker is not chief and hence should not save file, save it to a # temporary directory to be removed later. @@ -96,8 +95,10 @@ def remove_temp_dirpath(dirpath, strategy=None): # If strategy is still not available, this is not in distributed training. # Fallback to no-op. return - if strategy.extended._in_multi_worker_mode(): # pylint: disable=protected-access - if not multi_worker_util.is_chief(): + # TODO(anjalisridhar): Consider removing the check for multi worker mode since + # it is redundant when used with the should_checkpoint property. + if (strategy.extended._in_multi_worker_mode() and # pylint: disable=protected-access + not strategy.extended.should_checkpoint): # If this worker is not chief and hence should not save file, remove # the temporary directory. 
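Stepping back from this hunk: whether a worker writes to the real directory now hinges on `strategy.extended.should_checkpoint` rather than `multi_worker_util.is_chief()`, which lets each strategy define chief-ness itself. A hedged sketch of the resulting routing (a simplified standalone rendition, not the real module; `_get_temp_dir` stands in for the module's helper that derives a per-worker temporary directory):

```python
def write_dirpath_sketch(dirpath, strategy, _get_temp_dir):
  """Simplified sketch: route non-checkpointing workers to a temp dir."""
  if strategy is None:
    return dirpath  # not running distributed: write in place
  if not strategy.extended._in_multi_worker_mode():
    return dirpath  # single worker: write in place
  if strategy.extended.should_checkpoint:
    return dirpath  # e.g. the chief: keep the requested path
  # Workers that should not checkpoint write to a temporary directory that
  # remove_temp_dirpath() later deletes, keyed off the same property.
  return _get_temp_dir(dirpath, strategy)
```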
- file_io.delete_recursively(_get_temp_dir(dirpath, strategy)) + file_io.delete_recursively(_get_temp_dir(dirpath, strategy)) diff --git a/tensorflow/python/distribute/distribution_strategy_context.py b/tensorflow/python/distribute/distribution_strategy_context.py index 24d6a67187f..29593d65c5d 100644 --- a/tensorflow/python/distribute/distribution_strategy_context.py +++ b/tensorflow/python/distribute/distribution_strategy_context.py @@ -18,6 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import contextlib import threading from tensorflow.python.framework import ops @@ -266,6 +267,20 @@ def experimental_set_strategy(strategy): ops.get_default_graph()._global_distribute_strategy_scope = new_scope # pylint: disable=protected-access +# ------------------------------------------------------------------------------ +# Internal helpers. + + +@contextlib.contextmanager +def enter_or_assert_strategy(strategy): + if not has_strategy(): + with strategy.scope(): + yield + else: + _assert_strategy(strategy) + yield + + # ------------------------------------------------------------------------------ # Defaults that are used when no tf.distribute.Strategy is explicitly created. # We create them lazily in a function so that we can workaround the circular @@ -284,6 +299,17 @@ _default_replica_context_lock = threading.Lock() _default_replica_mode_lock = threading.Lock() +def _assert_strategy(strategy): + if not has_strategy(): + raise RuntimeError('Need to be inside "with strategy.scope()" for %s' % + (strategy,)) + current_strategy = get_strategy() + if current_strategy is not strategy: + raise RuntimeError( + "Mixing different tf.distribute.Strategy objects: %s is not %s" % + (current_strategy, strategy)) + + def _get_default_strategy(): if _defaults["strategy"] is None: # Avoid race condition causing two defaults to be created diff --git a/tensorflow/python/distribute/mirrored_strategy_test.py b/tensorflow/python/distribute/mirrored_strategy_test.py index d60d489c516..0ab4018ce13 100644 --- a/tensorflow/python/distribute/mirrored_strategy_test.py +++ b/tensorflow/python/distribute/mirrored_strategy_test.py @@ -42,7 +42,6 @@ from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.eager import function from tensorflow.python.eager import test -from tensorflow.python.framework import config from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import func_graph @@ -1340,34 +1339,36 @@ class MirroredVariableStopGradientTest(test.TestCase, parameterized.TestCase): self.assertIsNone(grads[0]) -class FunctionTest(test.TestCase): +@combinations.generate( + combinations.combine( + distribution=[ + strategy_combinations.mirrored_strategy_with_gpu_and_cpu, + ], + mode=["eager"])) +class FunctionTest(test.TestCase, parameterized.TestCase): - def testBackwardFunctionDevicePlacement(self): - if context.num_gpus() < 1: - self.skipTest("At least one GPU is required.") - devices = [device_util.resolve("/device:GPU:0"), - device_util.resolve("/device:CPU:0")] - ms = mirrored_strategy.MirroredStrategy(devices) - - with ms.scope(): + def testBackwardFunctionDevicePlacement(self, distribution): + with distribution.scope(): w = variable_scope.variable([1.5], name="w") b = variable_scope.variable([0.5], name="b") @def_function.function def forward(x, w, b): return x * w + b - x = constant_op.constant([1.0], 
name="x_useless") + + x = array_ops.identity([1.0], name="x_useless") concrete_forward = forward.get_concrete_function(x, w._primary, b._primary) - with ms.scope(): + with distribution.scope(): + def replica_fn(): with backprop.GradientTape() as t: - x = constant_op.constant([1.0], name="x") + x = array_ops.identity([1.0], name="x") loss = concrete_forward(x, w._get(), b._get()) - [1.0] return t.gradient(loss, [w, b]) def step_fn(): - return ms.experimental_run_v2(replica_fn) + return distribution.experimental_run_v2(replica_fn) context.enable_run_metadata() g1, g2 = step_fn() @@ -1383,30 +1384,32 @@ class FunctionTest(test.TestCase): for node in partition_graph.node: if node.name == node_name: devices_for_this_node.add(node.device) + devices = [device_util.resolve("/device:GPU:0"), + device_util.resolve("/device:CPU:0")] self.assertSetEqual(devices_for_this_node, set(devices)) - def testFuctionPreservesAutoGraph(self): - config.set_logical_device_configuration( - config.list_physical_devices("CPU")[0], - [context.LogicalDeviceConfiguration()] * 2) - ms = mirrored_strategy.MirroredStrategy() - + def testFuctionPreservesAutoGraph(self, distribution): def f(): self.assertTrue(converter_testing.is_inside_generated_code()) return 1 - with ms.scope(): + with distribution.scope(): + @def_function.function def replica_fn(): return f() - ms.experimental_run_v2(replica_fn) + distribution.experimental_run_v2(replica_fn) def _replica_id(): replica_id = ds_context.get_replica_context().replica_id_in_sync_group if not isinstance(replica_id, ops.Tensor): replica_id = constant_op.constant(replica_id) + # TODO(b/149852830): Workaround for small Tensor caching (which is only on + # CPU) to ensure the value is on the correct device. + replica_id = math_ops.cast(replica_id, dtypes.float32) + replica_id = math_ops.cast(replica_id, dtypes.int32) return replica_id diff --git a/tensorflow/python/distribute/strategy_test_lib.py b/tensorflow/python/distribute/strategy_test_lib.py index c889484ae68..00730959d4e 100644 --- a/tensorflow/python/distribute/strategy_test_lib.py +++ b/tensorflow/python/distribute/strategy_test_lib.py @@ -688,9 +688,9 @@ class RemoteSingleWorkerMirroredStrategyBase(DistributionTestBase): def _testDeviceScope(self, distribution): with distribution.scope(): - a = constant_op.constant(1.) + a = array_ops.identity(1.) with ops.device("/cpu:0"): - b = constant_op.constant(1.) + b = array_ops.identity(1.) 
if context.executing_eagerly(): device = "/job:worker/replica:0/task:0/device:CPU:0" else: diff --git a/tensorflow/python/distribute/tpu_strategy.py b/tensorflow/python/distribute/tpu_strategy.py index 54e2028ccaf..0ebb2918bb1 100644 --- a/tensorflow/python/distribute/tpu_strategy.py +++ b/tensorflow/python/distribute/tpu_strategy.py @@ -34,6 +34,7 @@ from tensorflow.python.distribute import distribute_lib from tensorflow.python.distribute import input_lib from tensorflow.python.distribute import numpy_dataset from tensorflow.python.distribute import reduce_util +from tensorflow.python.distribute import tpu_values from tensorflow.python.distribute import values from tensorflow.python.distribute.cluster_resolver import TPUClusterResolver from tensorflow.python.eager import context @@ -543,7 +544,7 @@ class TPUExtended(distribute_lib.StrategyExtendedV1): self._logical_device_stack.append(logical_device_id) try: - if values._enclosing_tpu_context() is None: # pylint: disable=protected-access + if tpu_values.enclosing_tpu_context() is None: yield else: with ops.device(tpu.core(logical_device_id)): @@ -648,20 +649,20 @@ class TPUExtended(distribute_lib.StrategyExtendedV1): with context.device_policy(context.DEVICE_PLACEMENT_SILENT): v = next_creator(**kwargs) - assert not isinstance(v, values.TPUMirroredVariable) + assert not isinstance(v, tpu_values.TPUMirroredVariable) value_list.append(v) return value_list return values.create_mirrored_variable(self._container_strategy(), _real_mirrored_creator, - values.TPUMirroredVariable, - values.TPUSyncOnReadVariable, + tpu_values.TPUMirroredVariable, + tpu_values.TPUSyncOnReadVariable, **kwargs) def _reduce_to(self, reduce_op, value, destinations): if (isinstance(value, values.DistributedValues) or tensor_util.is_tensor(value) - ) and values._enclosing_tpu_context() is not None: # pylint: disable=protected-access + ) and tpu_values.enclosing_tpu_context() is not None: if reduce_op == reduce_util.ReduceOp.MEAN: # TODO(jhseu): Revisit once we support model-parallelism. value *= (1. / self._num_replicas_in_sync) @@ -701,9 +702,9 @@ class TPUExtended(distribute_lib.StrategyExtendedV1): return output def _update(self, var, fn, args, kwargs, group): - assert isinstance(var, values.TPUVariableMixin) or isinstance( + assert isinstance(var, tpu_values.TPUVariableMixin) or isinstance( var, resource_variable_ops.BaseResourceVariable) - if values._enclosing_tpu_context() is not None: # pylint: disable=protected-access + if tpu_values.enclosing_tpu_context() is not None: if group: return fn(var, *args, **kwargs) else: @@ -724,7 +725,7 @@ class TPUExtended(distribute_lib.StrategyExtendedV1): return values.update_regroup(self, updates, group) def read_var(self, var): - assert isinstance(var, values.TPUVariableMixin) or isinstance( + assert isinstance(var, tpu_values.TPUVariableMixin) or isinstance( var, resource_variable_ops.BaseResourceVariable) return var.read_value() @@ -745,7 +746,7 @@ class TPUExtended(distribute_lib.StrategyExtendedV1): # since the `1` gets broadcast as an int32 but global_step is int64. 
if isinstance(tensor, (float, int)): return tensor - if values._enclosing_tpu_context() is not None: # pylint: disable=protected-access + if tpu_values.enclosing_tpu_context() is not None: broadcast_tensor = [tensor for _ in range(self._num_replicas_in_sync)] result = tpu_ops.all_to_all( broadcast_tensor, diff --git a/tensorflow/python/distribute/tpu_values.py b/tensorflow/python/distribute/tpu_values.py new file mode 100644 index 00000000000..871c85405e2 --- /dev/null +++ b/tensorflow/python/distribute/tpu_values.py @@ -0,0 +1,245 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Various classes representing TPU distributed values. + +Note that the tests are in values_test.py . + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import contextlib + +from tensorflow.python.distribute import distribution_strategy_context as ds_context +from tensorflow.python.distribute import values +from tensorflow.python.eager import context +from tensorflow.python.eager import tape +from tensorflow.python.framework import ops +from tensorflow.python.ops import gen_resource_variable_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.tpu import tpu + + +@contextlib.contextmanager +def _maybe_enter_graph(tensor): + # Note: might have an eager tensor but not be executing eagerly when + # building functions. + if (context.executing_eagerly() or isinstance(tensor, ops.EagerTensor) or + ops.has_default_graph()): + yield + else: + with tensor.graph.as_default(): + yield + + +def _make_raw_assign_fn(raw_assign_fn): # pylint: disable=missing-docstring + + def assign_fn(var, value, use_locking=False, name=None, read_value=True): # pylint: disable=missing-docstring + del use_locking # Unused. + + with _maybe_enter_graph(var.handle): + op = raw_assign_fn( + var.handle, ops.convert_to_tensor(value, dtype=var.dtype), name=name) + + with ops.control_dependencies([op]): + return var._read_variable_op() if read_value else op # pylint: disable=protected-access + + return assign_fn + + +class TPUVariableMixin(object): + """Mixin for TPU variables.""" + + def __init__(self, *args, **kwargs): + super(TPUVariableMixin, self).__init__(*args, **kwargs) + + # Handle ID is needed for `get_replicated_var_handle` to cache the variables + # correctly since in eager mode different variables can have the same name. 
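A tiny eager example of the collision the comment above guards against (the `if` that implements the fix continues in the hunk below):

```python
import tensorflow as tf

# Eager variable names are not uniquified, so two distinct variables can
# report the same name; a handle cache keyed on name alone would alias them.
v1 = tf.Variable(1.0, name="w")
v2 = tf.Variable(2.0, name="w")
print(v1.name == v2.name)  # True: both report "w:0"

# Suffixing the key with id(primary), as TPUVariableMixin does, keeps the
# cache entries distinct per variable object.
key1 = "w_" + str(id(v1))
key2 = "w_" + str(id(v2))
print(key1 == key2)  # False
```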
+ if ops.executing_eagerly_outside_functions(): + self._handle_id = self._common_name + "_" + str(id(self._primary)) + else: + self._handle_id = self._common_name + + def __getattr__(self, name): + if enclosing_tpu_context() is None: + return super(TPUVariableMixin, self).__getattr__(name) + else: + raise AttributeError( + "'{}' not accessible within a TPU context.".format(name)) + + def get(self): + if enclosing_tpu_context() is None: + return super(TPUVariableMixin, self).get() + else: + raise NotImplementedError( + "`TPUVariableMixin.get()` is not supported within a TPU context.") + + def _get_as_operand(self): + return self.read_value() + + def _get_closest(self): + if enclosing_tpu_context() is None: + return super(TPUVariableMixin, self)._get_closest() + else: + return self._primary + + def numpy(self): + if context.executing_eagerly(): + return self.read_value().numpy() + else: + raise NotImplementedError( + "numpy() is only available when eager execution is enabled.") + + def _is_mirrored(self): + raise NotImplementedError( + "`TPUVariableMixin._is_mirrored()` must be implemented by subclasses.") + + @property + def handle(self): + # If we're in a tpu.rewrite(), return the replicated handle. + tpu_context = enclosing_tpu_context() + if tpu_context is None: + return self._get_closest().handle + else: + return tpu_context.get_replicated_var_handle(self._handle_id, + self._values, + self._is_mirrored()) + + @property + def device(self): + return self.handle.device + + def _read_variable_op(self): + if self.trainable: + tape.variable_accessed(self) + return gen_resource_variable_ops.read_variable_op(self.handle, self.dtype) + + def read_value(self): + if enclosing_tpu_context() is None: + return super(TPUVariableMixin, self).read_value() + else: + return self._read_variable_op() + + def value(self): + if enclosing_tpu_context() is None: + return super(TPUVariableMixin, self).value() + else: + return self._read_variable_op() + + def _as_graph_element(self): + if enclosing_tpu_context() is None: + return super(TPUVariableMixin, self)._as_graph_element() # pylint: disable=protected-access + else: + return None + + @property + def op(self): + return values.DistributedVarOp(self._primary.op.name, + self._primary.op.graph, + self._primary.op.traceback, + self._primary.op.type) + + def _dense_var_to_tensor(self, dtype=None, name=None, as_ref=False): + """Converts a variable to a tensor.""" + # pylint: disable=protected-access + if enclosing_tpu_context() is None: + return super(TPUVariableMixin, self)._dense_var_to_tensor( + dtype=dtype, name=name, as_ref=as_ref) + # pylint: enable=protected-access + elif dtype is not None and dtype != self.dtype: + return math_ops.cast(self.read_value(), dtype) + else: + return self.handle if as_ref else self.read_value() + + +def enclosing_tpu_context(): + """Returns the TPUReplicateContext, which exists inside a tpu.rewrite().""" + graph = ops.get_default_graph() + while graph is not None: + # pylint: disable=protected-access + context_ = graph._get_control_flow_context() + # pylint: enable=protected-access + while context_ is not None: + if isinstance(context_, tpu.TPUReplicateContext): + return context_ + context_ = context_.outer_context + # This may be a FuncGraph due to defuns or v2 control flow. We need to + # find the original graph with the XLAControlFlowContext. 
+ graph = getattr(graph, "outer_graph", None) + return None + + +class TPUMirroredVariable(TPUVariableMixin, values.MirroredVariable): + """Holds a map from replica to TPU variables whose values are kept in sync.""" + + def _assign_func(self, *args, **kwargs): + with ds_context.enter_or_assert_strategy(self._distribute_strategy): + if (ds_context.in_cross_replica_context() and + (enclosing_tpu_context() is not None)): + f = kwargs.pop("f") + return self._distribute_strategy.extended.update( + self, f, args=args, kwargs=kwargs) + else: + return values.MirroredVariable._assign_func(self, *args, **kwargs) + + def assign_sub(self, *args, **kwargs): + assign_sub_fn = _make_raw_assign_fn( + gen_resource_variable_ops.assign_sub_variable_op) + return self._assign_func(f=assign_sub_fn, *args, **kwargs) + + def assign_add(self, *args, **kwargs): + assign_add_fn = _make_raw_assign_fn( + gen_resource_variable_ops.assign_add_variable_op) + return self._assign_func(f=assign_add_fn, *args, **kwargs) + + def assign(self, *args, **kwargs): + assign_fn = _make_raw_assign_fn( + gen_resource_variable_ops.assign_variable_op) + return self._assign_func(f=assign_fn, *args, **kwargs) + + def _is_mirrored(self): + return True + + +class TPUSyncOnReadVariable(TPUVariableMixin, values.SyncOnReadVariable): + """Holds a map from replica to variables whose values are reduced on save.""" + + def assign_sub(self, *args, **kwargs): + if enclosing_tpu_context() is None: + return values.SyncOnReadVariable.assign_sub(self, *args, **kwargs) + else: + return _make_raw_assign_fn( + gen_resource_variable_ops.assign_sub_variable_op)(self, *args, + **kwargs) + + def assign_add(self, *args, **kwargs): + if enclosing_tpu_context() is None: + return values.SyncOnReadVariable.assign_add(self, *args, **kwargs) + else: + return _make_raw_assign_fn( + gen_resource_variable_ops.assign_add_variable_op)(self, *args, + **kwargs) + + def assign(self, *args, **kwargs): + if enclosing_tpu_context() is None: + return values.SyncOnReadVariable.assign(self, *args, **kwargs) + else: + return _make_raw_assign_fn(gen_resource_variable_ops.assign_variable_op)( + self, *args, **kwargs) + + def _is_mirrored(self): + return False diff --git a/tensorflow/python/distribute/values.py b/tensorflow/python/distribute/values.py index 74e9c600cee..c23819cde11 100644 --- a/tensorflow/python/distribute/values.py +++ b/tensorflow/python/distribute/values.py @@ -19,12 +19,11 @@ from __future__ import division from __future__ import print_function import collections -import contextlib import weakref from tensorflow.python.distribute import device_util from tensorflow.python.distribute import distribute_lib -from tensorflow.python.distribute import distribution_strategy_context +from tensorflow.python.distribute import distribution_strategy_context as ds_context from tensorflow.python.distribute import reduce_util from tensorflow.python.eager import context from tensorflow.python.eager import tape @@ -34,19 +33,19 @@ from tensorflow.python.framework import tensor_util from tensorflow.python.framework import type_spec from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops -from tensorflow.python.ops import gen_resource_variable_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import variable_scope as vs from tensorflow.python.ops import variables as variables_lib -from tensorflow.python.tpu import tpu -from tensorflow.python.training import saver +from tensorflow.python.training.saving import 
saveable_object
+from tensorflow.python.training.saving import saveable_object_util
 from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export


 def _get_current_replica_id_as_int():
   """Returns the current replica ID as an integer, or `None`."""
-  replica_context = distribution_strategy_context.get_replica_context()
+  replica_context = ds_context.get_replica_context()
   if replica_context:
     replica_id = replica_context.replica_id_in_sync_group
     if not isinstance(replica_id, int):
@@ -56,10 +55,73 @@ def _get_current_replica_id_as_int():
   return replica_id


+@tf_export("distribute.DistributedValues", v1=[])
 class DistributedValues(object):
-  """Holds a map from replica to values. Either PerReplica or Mirrored."""
+  """Base class for representing distributed values.
+
+  A subclass instance of DistributedValues is created when creating variables
+  within a distribution strategy, iterating a `tf.data.Dataset`, or calling
+  `strategy.experimental_run_v2`. This base class should never be instantiated
+  directly. DistributedValues contains a value per replica. Depending on
+  the subclass, the values could either be synced on update, synced on demand,
+  or never synced.
+
+  DistributedValues can be reduced to obtain a single value across replicas,
+  passed as input into `experimental_run_v2`, or have their per-replica values
+  inspected using `experimental_local_results`.
+
+  Example usage:
+
+  1. Created from a Dataset:
+
+  >>> strategy = tf.distribute.MirroredStrategy()
+  >>> dataset = tf.data.Dataset.from_tensor_slices([5., 6., 7., 8.]).batch(2)
+  >>> dataset_iterator = iter(strategy.experimental_distribute_dataset(dataset))
+  >>> distributed_values = next(dataset_iterator)
+
+  2. Returned by `experimental_run_v2`:
+
+  >>> strategy = tf.distribute.MirroredStrategy()
+  >>> @tf.function
+  ... def run():
+  ...   ctx = tf.distribute.get_replica_context()
+  ...   return ctx.replica_id_in_sync_group
+  >>> distributed_values = strategy.experimental_run_v2(run)
+
+  3. As input into `experimental_run_v2`:
+
+  >>> strategy = tf.distribute.MirroredStrategy()
+  >>> dataset = tf.data.Dataset.from_tensor_slices([5., 6., 7., 8.]).batch(2)
+  >>> dataset_iterator = iter(strategy.experimental_distribute_dataset(dataset))
+  >>> distributed_values = next(dataset_iterator)
+  >>> @tf.function
+  ... def run(input):
+  ...   return input + 1.0
+  >>> updated_value = strategy.experimental_run_v2(run,
+  ...                                              args=(distributed_values,))
+
+  4. Reduce value:
+
+  >>> strategy = tf.distribute.MirroredStrategy()
+  >>> dataset = tf.data.Dataset.from_tensor_slices([5., 6., 7., 8.]).batch(2)
+  >>> dataset_iterator = iter(strategy.experimental_distribute_dataset(dataset))
+  >>> distributed_values = next(dataset_iterator)
+  >>> reduced_value = strategy.reduce(tf.distribute.ReduceOp.SUM,
+  ...                                 distributed_values,
+  ...                                 axis=0)
+
+  5. Inspect per-replica values:
+
+  >>> strategy = tf.distribute.MirroredStrategy()
+  >>> dataset = tf.data.Dataset.from_tensor_slices([5., 6., 7., 8.]).batch(2)
+  >>> dataset_iterator = iter(strategy.experimental_distribute_dataset(dataset))
+  >>> per_replica_values = strategy.experimental_local_results(
+  ...     distributed_values)
+  >>> per_replica_values
+  (<tf.Tensor: shape=(2,), dtype=float32, numpy=array([5., 6.], dtype=float32)>,)
+
+  """

   def __init__(self, values):
+    """Should only be called by subclass __init__."""
     self._values = tuple(values)

   def _get(self):
@@ -297,7 +359,7 @@ class PerReplicaSpec(type_spec.TypeSpec):
     return self._value_specs

   def _to_components(self, value):
-    replica_context = distribution_strategy_context.get_replica_context()
+    replica_context = ds_context.get_replica_context()
     if replica_context is not None and replica_context.num_replicas_in_sync > 1:
       raise ValueError(
           "Flattening a PerReplica to components is not supported in replica "
@@ -340,27 +402,6 @@ def _assign_sub_on_device(device, variable, tensor):
     return variable.assign_sub(tensor)


-def _assert_strategy(strategy):
-  if not distribution_strategy_context.has_strategy():
-    raise RuntimeError('Need to be inside "with strategy.scope()" for %s' %
-                       (strategy,))
-  current_strategy = distribution_strategy_context.get_strategy()
-  if current_strategy is not strategy:
-    raise RuntimeError(
-        "Mixing different tf.distribute.Strategy objects: %s is not %s" %
-        (current_strategy, strategy))
-
-
-@contextlib.contextmanager
-def _enter_or_assert_strategy(strategy):
-  if not distribution_strategy_context.has_strategy():
-    with strategy.scope():
-      yield
-  else:
-    _assert_strategy(strategy)
-    yield
-
-
 DistributedVarOp = collections.namedtuple(
     "DistributedVarOp", ["name", "graph", "traceback", "type"])

@@ -513,7 +554,7 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable):
     # We want cross-replica code that does some var.op.X calls
     # to work (even if the current device isn't in self._devices), but
     # other uses of var.op in a cross-replica context to fail.
-    if distribution_strategy_context.in_cross_replica_context():
+    if ds_context.in_cross_replica_context():
       return DistributedVarOp(self._primary.op.name, self._primary.op.graph,
                               self._primary.op.traceback, self._primary.op.type)
     return self._get().op
@@ -523,7 +564,7 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable):
     return self._primary._in_graph_mode  # pylint: disable=protected-access

   def read_value(self):
-    with _enter_or_assert_strategy(self._distribute_strategy):
+    with ds_context.enter_or_assert_strategy(self._distribute_strategy):
       return array_ops.identity(self._get())

   def value(self):
@@ -537,135 +578,6 @@
 ops.register_dense_tensor_like_type(DistributedVariable)


-@contextlib.contextmanager
-def _maybe_enter_graph(tensor):
-  # Note: might have an eager tensor but not be executing eagerly when
-  # building functions.
-  if (context.executing_eagerly() or isinstance(tensor, ops.EagerTensor) or
-      ops.has_default_graph()):
-    yield
-  else:
-    with tensor.graph.as_default():
-      yield
-
-
-def _make_raw_assign_fn(raw_assign_fn):  # pylint: disable=missing-docstring
-
-  def assign_fn(var, value, use_locking=False, name=None, read_value=True):  # pylint: disable=missing-docstring
-    del use_locking  # Unused.
- - with _maybe_enter_graph(var.handle): - op = raw_assign_fn( - var.handle, ops.convert_to_tensor(value, dtype=var.dtype), name=name) - - with ops.control_dependencies([op]): - return var._read_variable_op() if read_value else op # pylint: disable=protected-access - - return assign_fn - - -class TPUVariableMixin(object): - """Mixin for TPU variables.""" - - def __init__(self, *args, **kwargs): - super(TPUVariableMixin, self).__init__(*args, **kwargs) - - # Handle ID is needed for `get_replicated_var_handle` to cache the variables - # correctly since in eager mode different variables can have the same name. - if ops.executing_eagerly_outside_functions(): - self._handle_id = self._common_name + "_" + str(id(self._primary)) - else: - self._handle_id = self._common_name - - def __getattr__(self, name): - if _enclosing_tpu_context() is None: - return super(TPUVariableMixin, self).__getattr__(name) - else: - raise AttributeError( - "'{}' not accessible within a TPU context.".format(name)) - - def get(self): - if _enclosing_tpu_context() is None: - return super(TPUVariableMixin, self).get() - else: - raise NotImplementedError( - "`TPUVariableMixin.get()` is not supported within a TPU context.") - - def _get_as_operand(self): - return self.read_value() - - def _get_closest(self): - if _enclosing_tpu_context() is None: - return super(TPUVariableMixin, self)._get_closest() - else: - return self._primary - - def numpy(self): - if context.executing_eagerly(): - return self.read_value().numpy() - else: - raise NotImplementedError( - "numpy() is only available when eager execution is enabled.") - - def _is_mirrored(self): - raise NotImplementedError( - "`TPUVariableMixin._is_mirrored()` must be implemented by subclasses.") - - @property - def handle(self): - # If we're in a tpu.rewrite(), return the replicated handle. 
- tpu_context = _enclosing_tpu_context() - if tpu_context is None: - return self._get_closest().handle - else: - return tpu_context.get_replicated_var_handle( - self._handle_id, self._values, self._is_mirrored()) - - @property - def device(self): - return self.handle.device - - def _read_variable_op(self): - if self.trainable: - tape.variable_accessed(self) - return gen_resource_variable_ops.read_variable_op(self.handle, self.dtype) - - def read_value(self): - if _enclosing_tpu_context() is None: - return super(TPUVariableMixin, self).read_value() - else: - return self._read_variable_op() - - def value(self): - if _enclosing_tpu_context() is None: - return super(TPUVariableMixin, self).value() - else: - return self._read_variable_op() - - def _as_graph_element(self): - if _enclosing_tpu_context() is None: - return super(TPUVariableMixin, self)._as_graph_element() # pylint: disable=protected-access - else: - return None - - @property - def op(self): - return DistributedVarOp(self._primary.op.name, self._primary.op.graph, - self._primary.op.traceback, self._primary.op.type) - - def _dense_var_to_tensor(self, dtype=None, name=None, as_ref=False): - """Converts a variable to a tensor.""" - # pylint: disable=protected-access - if _enclosing_tpu_context() is None: - return super(TPUVariableMixin, self)._dense_var_to_tensor( - dtype=dtype, name=name, as_ref=as_ref) - # pylint: enable=protected-access - elif dtype is not None and dtype != self.dtype: - return math_ops.cast(self.read_value(), dtype) - else: - return self.handle if as_ref else self.read_value() - - def _validate_colocate_extended(v, extended): variable_strategy = v._distribute_strategy # pylint: disable=protected-access if variable_strategy.extended is not extended: @@ -719,7 +631,7 @@ _aggregation_error_msg = ( "using `tf.distribute.StrategyExtended.update()`.") -class _MirroredSaveable(saver.BaseSaverBuilder.ResourceVariableSaveable): +class _MirroredSaveable(saveable_object_util.ResourceVariableSaveable): """Class for defining how to restore a MirroredVariable.""" def __init__(self, mirrored_variable, primary_variable, name): @@ -823,9 +735,9 @@ class MirroredVariable(DistributedVariable, Mirrored): # update_non_slot() function (like OptimizerV2._finish), which can # update several non-slot variables in one call. 
def _assign_func(self, *args, **kwargs): - with _enter_or_assert_strategy(self._distribute_strategy): + with ds_context.enter_or_assert_strategy(self._distribute_strategy): f = kwargs.pop("f") - if distribution_strategy_context.in_cross_replica_context(): + if ds_context.in_cross_replica_context(): update_replica_id = distribute_lib.get_update_replica_id() if update_replica_id is not None: # We are calling an assign function on the mirrored variable in an @@ -868,7 +780,7 @@ class MirroredVariable(DistributedVariable, Mirrored): return strategy.extended.update( self, f, args=(v,) + other_args, kwargs=other_kwargs) - return distribution_strategy_context.get_replica_context().merge_call( + return ds_context.get_replica_context().merge_call( merge_fn, args=args, kwargs=kwargs) def assign_sub(self, *args, **kwargs): @@ -938,61 +850,12 @@ ops.register_tensor_conversion_function(Mirrored, _tensor_conversion_mirrored_val) -def _enclosing_tpu_context(): - """Returns the TPUReplicateContext, which exists inside a tpu.rewrite().""" - graph = ops.get_default_graph() - while graph is not None: - # pylint: disable=protected-access - context_ = graph._get_control_flow_context() - # pylint: enable=protected-access - while context_ is not None: - if isinstance(context_, tpu.TPUReplicateContext): - return context_ - context_ = context_.outer_context - # This may be a FuncGraph due to defuns or v2 control flow. We need to - # find the original graph with the XLAControlFlowContext. - graph = getattr(graph, "outer_graph", None) - return None - - def is_distributed_variable(v): """Determine if a variable is ds variable or TPU mirrored variable.""" return isinstance(v, DistributedVariable) -class TPUMirroredVariable(TPUVariableMixin, MirroredVariable): - """Holds a map from replica to TPU variables whose values are kept in sync.""" - - def _assign_func(self, *args, **kwargs): - with _enter_or_assert_strategy(self._distribute_strategy): - if (distribution_strategy_context.in_cross_replica_context() and - (_enclosing_tpu_context() is not None)): - f = kwargs.pop("f") - return self._distribute_strategy.extended.update( - self, f, args=args, kwargs=kwargs) - else: - return MirroredVariable._assign_func(self, *args, **kwargs) - - def assign_sub(self, *args, **kwargs): - assign_sub_fn = _make_raw_assign_fn( - gen_resource_variable_ops.assign_sub_variable_op) - return self._assign_func(f=assign_sub_fn, *args, **kwargs) - - def assign_add(self, *args, **kwargs): - assign_add_fn = _make_raw_assign_fn( - gen_resource_variable_ops.assign_add_variable_op) - return self._assign_func(f=assign_add_fn, *args, **kwargs) - - def assign(self, *args, **kwargs): - assign_fn = _make_raw_assign_fn( - gen_resource_variable_ops.assign_variable_op) - return self._assign_func(f=assign_fn, *args, **kwargs) - - def _is_mirrored(self): - return True - - -class _SyncOnReadSaveable(saver.BaseSaverBuilder.SaveableObject): +class _SyncOnReadSaveable(saveable_object.SaveableObject): """Class for defining how to restore a SyncOnReadVariable.""" def __init__(self, sync_on_read_variable, name): @@ -1004,7 +867,7 @@ class _SyncOnReadSaveable(saver.BaseSaverBuilder.SaveableObject): strategy = sync_on_read_variable._distribute_strategy # pylint: disable=protected-access return strategy.extended.read_var(sync_on_read_variable) - spec = saver.BaseSaverBuilder.SaveSpec( + spec = saveable_object.SaveSpec( tensor=tensor, slice_spec="", name=name, @@ -1029,7 +892,7 @@ class _SyncOnReadSaveable(saver.BaseSaverBuilder.SaveableObject): def 
_assert_replica_context(strategy): - replica_context = distribution_strategy_context.get_replica_context() + replica_context = ds_context.get_replica_context() if not replica_context: raise RuntimeError( "Replica-local variables may only be assigned in a replica context.") @@ -1046,8 +909,8 @@ class SyncOnReadVariable(DistributedVariable): self._aggregation = aggregation def assign_sub(self, *args, **kwargs): - with _enter_or_assert_strategy(self._distribute_strategy): - if distribution_strategy_context.in_cross_replica_context(): + with ds_context.enter_or_assert_strategy(self._distribute_strategy): + if ds_context.in_cross_replica_context(): if self._aggregation == vs.VariableAggregation.SUM: raise ValueError( "SyncOnReadVariable does not support `assign_sub` in " @@ -1061,8 +924,8 @@ class SyncOnReadVariable(DistributedVariable): return self._get().assign_sub(*args, **kwargs) def assign_add(self, *args, **kwargs): - with _enter_or_assert_strategy(self._distribute_strategy): - if distribution_strategy_context.in_cross_replica_context(): + with ds_context.enter_or_assert_strategy(self._distribute_strategy): + if ds_context.in_cross_replica_context(): if self._aggregation == vs.VariableAggregation.SUM: raise ValueError( "SyncOnReadVariable does not support `assign_add` in " @@ -1076,8 +939,8 @@ class SyncOnReadVariable(DistributedVariable): return self._get().assign_add(*args, **kwargs) def assign(self, *args, **kwargs): - with _enter_or_assert_strategy(self._distribute_strategy): - if distribution_strategy_context.in_cross_replica_context(): + with ds_context.enter_or_assert_strategy(self._distribute_strategy): + if ds_context.in_cross_replica_context(): # To preserve the sum across save and restore, we have to divide the # total across all devices when restoring a variable that was summed # when saving. @@ -1090,8 +953,8 @@ class SyncOnReadVariable(DistributedVariable): return self._get().assign(*args, **kwargs) def value(self): - with _enter_or_assert_strategy(self._distribute_strategy): - if distribution_strategy_context.in_cross_replica_context(): + with ds_context.enter_or_assert_strategy(self._distribute_strategy): + if ds_context.in_cross_replica_context(): return self._get_cross_replica() else: # _get_closest() returns a Variable. 
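The `assign` path above encodes an invariant worth spelling out: a SUM-aggregated sync-on-read variable is saved as the sum of its per-replica values, so a cross-replica assign (as during restore) divides by the replica count to keep the effective total unchanged. Toy arithmetic with hypothetical numbers, not TF API:

```python
num_replicas = 2
per_replica = [3.0, 5.0]        # live per-replica values
saved = sum(per_replica)        # checkpoint stores the reduced sum: 8.0

# Restoring assigns saved / num_replicas to every replica...
restored = [saved / num_replicas] * num_replicas   # [4.0, 4.0]

# ...so reading the variable back (which sums across replicas) still
# yields the value that was saved.
assert sum(restored) == saved
```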
@@ -1112,7 +975,7 @@ class SyncOnReadVariable(DistributedVariable): if self._aggregation == vs.VariableAggregation.ONLY_FIRST_REPLICA: return self._primary - with _enter_or_assert_strategy(self._distribute_strategy): + with ds_context.enter_or_assert_strategy(self._distribute_strategy): return self._distribute_strategy.reduce( reduce_util.ReduceOp.from_variable_aggregation(self.aggregation), self, @@ -1120,8 +983,8 @@ class SyncOnReadVariable(DistributedVariable): def _as_graph_element(self): # pylint: disable=protected-access - with _enter_or_assert_strategy(self._distribute_strategy): - if distribution_strategy_context.in_cross_replica_context(): + with ds_context.enter_or_assert_strategy(self._distribute_strategy): + if ds_context.in_cross_replica_context(): return ops.convert_to_tensor(self._get_cross_replica()) return self._get()._as_graph_element() @@ -1142,7 +1005,7 @@ class SyncOnReadVariable(DistributedVariable): def _dense_var_to_tensor(self, dtype=None, name=None, as_ref=False): """Converts a variable to a tensor.""" - with _enter_or_assert_strategy(self._distribute_strategy): + with ds_context.enter_or_assert_strategy(self._distribute_strategy): return ops.convert_to_tensor( self._get(), dtype=dtype, name=name, as_ref=as_ref) @@ -1157,36 +1020,6 @@ ops.register_tensor_conversion_function(SyncOnReadVariable, _tensor_conversion_sync_on_read) -class TPUSyncOnReadVariable(TPUVariableMixin, SyncOnReadVariable): - """Holds a map from replica to variables whose values are reduced on save.""" - - def assign_sub(self, *args, **kwargs): - if _enclosing_tpu_context() is None: - return SyncOnReadVariable.assign_sub(self, *args, **kwargs) - else: - return _make_raw_assign_fn( - gen_resource_variable_ops.assign_sub_variable_op)(self, *args, - **kwargs) - - def assign_add(self, *args, **kwargs): - if _enclosing_tpu_context() is None: - return SyncOnReadVariable.assign_add(self, *args, **kwargs) - else: - return _make_raw_assign_fn( - gen_resource_variable_ops.assign_add_variable_op)(self, *args, - **kwargs) - - def assign(self, *args, **kwargs): - if _enclosing_tpu_context() is None: - return SyncOnReadVariable.assign(self, *args, **kwargs) - else: - return _make_raw_assign_fn(gen_resource_variable_ops.assign_variable_op)( - self, *args, **kwargs) - - def _is_mirrored(self): - return False - - def regroup(values, wrap_class=PerReplica): """Makes a nest per-replica into a nest of PerReplica/Mirrored values.""" v0 = values[0] @@ -1379,9 +1212,9 @@ class AggregatingVariable(variables_lib.Variable): return getattr(self._v, name) def _assign_func(self, *args, **kwargs): - with _enter_or_assert_strategy(self._distribute_strategy): + with ds_context.enter_or_assert_strategy(self._distribute_strategy): f = kwargs.pop("f") - if distribution_strategy_context.in_cross_replica_context(): + if ds_context.in_cross_replica_context(): if distribute_lib.get_update_replica_id() is not None: # We are calling an assign function in an update context. return f(self._v, *args, **kwargs) @@ -1391,7 +1224,7 @@ class AggregatingVariable(variables_lib.Variable): return self._distribute_strategy.extended.update( self, f, args=args, kwargs=kwargs) else: - replica_context = distribution_strategy_context.get_replica_context() + replica_context = ds_context.get_replica_context() assert replica_context # We are calling an assign function in replica context. # We reduce the value we want to assign/add/sub. 
More details about how diff --git a/tensorflow/python/distribute/values_test.py b/tensorflow/python/distribute/values_test.py index f2922e6e53a..d66726424a1 100644 --- a/tensorflow/python/distribute/values_test.py +++ b/tensorflow/python/distribute/values_test.py @@ -29,6 +29,7 @@ from tensorflow.python.distribute import distribute_lib from tensorflow.python.distribute import distribution_strategy_context from tensorflow.python.distribute import strategy_combinations from tensorflow.python.distribute import tpu_strategy +from tensorflow.python.distribute import tpu_values from tensorflow.python.distribute import values from tensorflow.python.distribute.cluster_resolver import tpu_cluster_resolver from tensorflow.python.eager import context @@ -824,7 +825,7 @@ def _make_replica_local(method, strategy=None): name=n, initializer=init, use_resource=True)) if (strategy is not None) and isinstance(strategy, _TPU_STRATEGIES): - var_cls = values.TPUSyncOnReadVariable + var_cls = tpu_values.TPUSyncOnReadVariable else: var_cls = values.SyncOnReadVariable replica_local = var_cls(strategy, v, method) diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py index d87c157d1e6..cd58565f5bb 100644 --- a/tensorflow/python/eager/context.py +++ b/tensorflow/python/eager/context.py @@ -621,8 +621,25 @@ class Context(object): else: raise ValueError("Context is not initialized.") - def clear_remote_executors(self): - """Clear executors on remote workers. + def sync_executors(self): + """Sync both local executors and the ones on remote workers. + + In async execution mode, local function calls can return before the + corresponding remote op/function execution requests are completed. Calling + this method creates a synchronization barrier for remote executors. It only + returns when all remote pending nodes are finished, potentially with errors + if any remote executors are in an error state. + + Raises: + ValueError: if context is not initialized. + """ + if self._context_handle: + pywrap_tfe.TFE_ContextSyncExecutors(self._context_handle) + else: + raise ValueError("Context is not initialized.") + + def clear_executor_errors(self): + """Clear errors in both local executors and remote workers. After receiving errors from remote workers, additional requests on the fly could further taint the status on the remote workers due to the async nature @@ -633,7 +650,7 @@ ValueError: if context is not initialized. """ if self._context_handle: - pywrap_tfe.TFE_ContextClearRemoteExecutors(self._context_handle) + pywrap_tfe.TFE_ContextClearExecutors(self._context_handle) else: raise ValueError("Context is not initialized.") @@ -2019,16 +2036,6 @@ def is_async(): return context().is_async() -def async_wait(): - """Waits for ops dispatched in ASYNC mode to finish.""" - return context().executor.wait() - - -def async_clear_error(): - """Clears errors raised during ASYNC execution mode.""" - return context().executor.clear_error() - - def num_gpus(): """Get the number of available GPU devices. @@ -2135,6 +2142,65 @@ def check_alive(worker_name): return context().check_alive(worker_name) +def async_wait(): + """Sync all async operations and raise any errors during execution. + + In async execution mode, an op/function call can return before finishing the + actual execution. Calling this method creates a synchronization barrier for + all async op and function execution.
It only returns when all pending nodes + are finished, potentially raising exceptions if async execution results in + an error state. + + Users may write the following code to asynchronously invoke `train_step_fn` + and log the `loss` metric for every `num_steps` steps in a training loop. + `train_step_fn` internally consumes data using `iterator.get_next()`, and may + throw OutOfRangeError when running out of data. In that case: + - If the exception is thrown during the loop of scheduling function steps, + the next function call triggers an exception. In the except block, + we clear the error and break from the loop; + - If all `train_step_fn`s are scheduled before throwing an exception, we + block at the last iteration to wait for the scheduled functions to finish + execution and throw the OutOfRangeError. + + ``` + for i in range(num_steps): + try: + # Step function updates the metric `loss` internally + train_step_fn() + if i == num_steps - 1: + context.async_wait() + except tf.errors.OutOfRangeError: + context.async_clear_error() + break + logging.info('loss =', loss.numpy()) + ``` + """ + context().sync_executors() + + +def async_clear_error(): + """Clear pending operations and error statuses in async execution. + + In async execution mode, an error in op/function execution can lead to errors + in subsequent ops/functions that are scheduled but not yet executed. Calling + this method clears all pending operations and resets the async execution state. + + Example: + + ``` + while True: + try: + # Step function updates the metric `loss` internally + train_step_fn() + except tf.errors.OutOfRangeError: + context.async_clear_error() + break + logging.info('loss =', loss.numpy()) + ``` + """ + context().clear_executor_errors() + + def add_function(fdef): """Add a function definition to the context.""" context().add_function(fdef) diff --git a/tensorflow/python/eager/core_test.py b/tensorflow/python/eager/core_test.py index 8993efd4085..47b3966827f 100644 --- a/tensorflow/python/eager/core_test.py +++ b/tensorflow/python/eager/core_test.py @@ -63,7 +63,7 @@ def truncated_normal(shape): def current_device(): - return constant_op.constant(1.).device + return array_ops.identity(1.).device def configure_virtual_cpus(): @@ -79,6 +79,7 @@ class TFETest(test_util.TensorFlowTestCase): def setUp(self): super(TFETest, self).setUp() + context._reset_context() configure_virtual_cpus() def _test_hashable(self, a, b, hashable): @@ -393,21 +394,23 @@ class TFETest(test_util.TensorFlowTestCase): def testMultiCpuPlacement(self): with ops.device('cpu:1'): - x = constant_op.constant(1.0) - y = array_ops.identity(x) + x = array_ops.identity(1.0) + with ops.device('cpu:0'): + y = array_ops.identity(x) self.assertEqual(x.device, '/job:localhost/replica:0/task:0/device:CPU:1') self.assertEqual(y.device, '/job:localhost/replica:0/task:0/device:CPU:0') @test_util.run_gpu_only def testShouldCopy(self): - with ops.device('gpu:0'): - x = constant_op.constant(1.0) + with ops.device('GPU:0'): + x = array_ops.identity(1.0) + self.assertEqual(x.device, '/job:localhost/replica:0/task:0/device:GPU:0') y = array_ops.identity(x) # The value we're testing y.device against will depend on what the behavior # of not explicitly specifying a device in the context is.
This behavior is # subject to change (for example, in the future we may want to use GPUs, if # available, when no device is explicitly provided) - self.assertEqual(y.device, '/job:localhost/replica:0/task:0/device:CPU:0') + self.assertEqual(y.device, current_device()) def testContextSwitchStackContainsEagerMode(self): # Eager execution has been enabled, and no other context switch has @@ -488,6 +491,7 @@ class TFETest(test_util.TensorFlowTestCase): self.assertEndsWith(current_device(), 'GPU:0') gpu.__exit__() self.assertEndsWith(current_device(), 'CPU:0') + cpu.__exit__() @test_util.run_gpu_only def testReEntrant(self): @@ -563,12 +567,14 @@ class TFETest(test_util.TensorFlowTestCase): def simple_fn(unused_handle): return 1. + with ops.device('CPU:0'): + test_var = variables.Variable([2., 3.]) + @def_function.function def test_fn(v): script_ops.eager_py_func(simple_fn, [v.handle], dtypes.float32) return 1. - test_var = variables.Variable([2., 3.]) self.assertAllEqual(test_fn(test_var), 1.0) def testPyFunctionAsync(self): @@ -627,15 +633,20 @@ class TFETest(test_util.TensorFlowTestCase): attrs=('T', three.dtype.as_datatype_enum))[0] self.assertAllEqual(15, product) # Error: Invalid arguments - context.set_execution_mode(context.ASYNC) - with self.assertRaises(errors.InvalidArgumentError): - execute( - b'MatMul', - num_outputs=1, - inputs=[three, five], - attrs=('transpose_a', False, 'transpose_b', False, 'T', - three.dtype.as_datatype_enum)) - context.context().executor.wait() + # TODO(b/149995282): When an exception is thrown in ASYNC mode, it seems + # there are things left over that cause mutex corruption when + # _reset_context() is called before the next test is executed. + # + # context.set_execution_mode(context.ASYNC) + # with self.assertRaises(errors.InvalidArgumentError): + # execute( + # b'MatMul', + # num_outputs=1, + # inputs=[three, five], + # attrs=('transpose_a', False, 'transpose_b', False, 'T', + # three.dtype.as_datatype_enum)) + # context.context().executor.wait() + # context.context().executor.clear_error() context.context().execution_mode = context.SYNC @@ -1014,7 +1025,8 @@ class TFETest(test_util.TensorFlowTestCase): t.join() def testEmptyResourceReturned(self): - v = variables.Variable(1.) + with ops.device('CPU:0'): + v = variables.Variable(1.) 
empty_handle = array_ops.gather( v.handle[array_ops.newaxis], array_ops.zeros([0], dtype=dtypes.int32)) self.assertEqual( @@ -1051,6 +1063,7 @@ class SendRecvTest(test_util.TensorFlowTestCase): def setUp(self): super(SendRecvTest, self).setUp() + context._reset_context() configure_virtual_cpus() def testBasic(self): @@ -1069,7 +1082,7 @@ class SendRecvTest(test_util.TensorFlowTestCase): def testLocalCrossDevice(self): gpu_device_name = '/job:localhost/replica:0/task:0/device:GPU:0' with ops.device('GPU:0'): - t0 = constant_op.constant(1.0) + t0 = array_ops.identity(1.0) self._send(t0, 't0', self.cpu_device) with ops.device('cpu:0'): self.assertAllEqual( @@ -1086,6 +1099,7 @@ class EagerTensorCacheTest(test_util.TensorFlowTestCase): def setUp(self): super(EagerTensorCacheTest, self).setUp() + context._reset_context() configure_virtual_cpus() def testCacheSkipsTensorsTooLarge(self): @@ -1098,4 +1112,5 @@ class EagerTensorCacheTest(test_util.TensorFlowTestCase): if __name__ == '__main__': + context.set_log_device_placement(True) test.main() diff --git a/tensorflow/python/eager/def_function.py b/tensorflow/python/eager/def_function.py index 76af2d32c3e..6c116aa26c3 100644 --- a/tensorflow/python/eager/def_function.py +++ b/tensorflow/python/eager/def_function.py @@ -382,20 +382,20 @@ class Function(object): conversion options when autograph is set to True. experimental_relax_shapes: When true, argument shapes may be relaxed to avoid unnecessary retracing. - experimental_compile: If false, execute the function in a regular way. The - function is optimized by some graph rewrite passes (some ops might be - clustered into a single op) and interpreted by the standard TensorFlow - executor, which dispatches op kernels one by one as they become - executable. Set it to false when directly running a multi-device - function on TPUs (e.g. two TPU cores, one TPU core and its - host CPU). If True, the function is compiled directly by XLA. XLA would - fuse all the ops and emit more efficient code to run for some devices - (e.g. TPU, XLA_GPU) and some use cases (e.g. dense tensor computation). - It requires that the whole function is compilable by XLA. If None - (default), compile the function with XLA when running on TPU and go - through the regular function execution path when running on other - devices. - + experimental_compile: If `True`, compiles the function using XLA + (see https://tensorflow.org/xla). XLA performs compiler optimizations, + such as fusion, and attempts to emit more efficient code. This may + drastically improve the performance. If set to `True`, + the whole function needs to be compilable by XLA, or an + `errors.InvalidArgumentError` is thrown. + If `None` (default), compiles the function with XLA when running on TPU + and goes through the regular function execution path when running on + other devices. + If `False`, executes the function in a regular way (graph rewrite + passes are applied, kernels are dispatched one-by-one by the TensorFlow + executor). Set this value to `False` when directly running a + multi-device function on TPUs (e.g. two TPU cores, one TPU core and its + host CPU). Raises: ValueError: if `input_signature` is not None and the `python_function`'s argspec has keyword arguments. 
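Because the rewritten `experimental_compile` docstring above distinguishes three modes (`True`, `False`, `None`), a short sketch may make the contract concrete; this is illustrative only and is not part of the patch:

```python
import tensorflow as tf

# experimental_compile=True: the whole body must be XLA-compilable, and XLA
# may fuse the tanh/multiply/add chain into fewer kernels.
@tf.function(experimental_compile=True)
def compiled_fn(x):
  return tf.math.tanh(x) * 2.0 + 1.0

# experimental_compile=False: always take the regular executor path, with
# kernels dispatched one by one (even on TPU).
@tf.function(experimental_compile=False)
def regular_fn(x):
  return tf.math.tanh(x) * 2.0 + 1.0

x = tf.ones([2, 2])
# Both produce the same values; only the execution path differs.
print(compiled_fn(x))
print(regular_fn(x))
```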
diff --git a/tensorflow/python/eager/device_placement_test.py b/tensorflow/python/eager/device_placement_test.py index 4318313c597..af6c68243b4 100644 --- a/tensorflow/python/eager/device_placement_test.py +++ b/tensorflow/python/eager/device_placement_test.py @@ -18,21 +18,27 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized + from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.eager import remote from tensorflow.python.eager import test from tensorflow.python.framework import config from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import test_util +from tensorflow.python.ops import array_ops -class SoftDevicePlacementTest(test.TestCase): +class SoftDevicePlacementTest(test.TestCase, parameterized.TestCase): def setUp(self): - context.context().soft_device_placement = True + super(SoftDevicePlacementTest, self).setUp() + context._reset_context() + config.set_soft_device_placement(enabled=True) context.context().log_device_placement = True @test_util.run_gpu_only @@ -86,11 +92,60 @@ class SoftDevicePlacementTest(test.TestCase): # We don't support nested device placement right now. self.assertIn('GPU:0', c.device) + @parameterized.named_parameters(('float', 1.0, None), + ('int32', [1], dtypes.int32), + ('string', ['a'], None)) + def testSoftPlacedCPUConstant(self, value, dtype): + with ops.device('GPU:0'): + a = constant_op.constant(value, dtype=dtype) + self.assertIn('CPU:0', a.device) + self.assertIn('CPU:0', a.backing_device) + + +class HardDevicePlacementTest(test.TestCase, parameterized.TestCase): + + def setUp(self): + super(HardDevicePlacementTest, self).setUp() + context._reset_context() + config.set_soft_device_placement(enabled=False) + context.context().log_device_placement = True + self.assertEqual(config.get_soft_device_placement(), False) + self.assertEqual(context.context().soft_device_placement, False) + + @test_util.run_gpu_only + def testIdentityCanCopy(self): + config.set_device_policy('explicit') + with ops.device('CPU:0'): + x = constant_op.constant(1.0) + self.assertIn('CPU:0', x.device) + self.assertIn('CPU:0', x.backing_device) + with ops.device('GPU:0'): + y = array_ops.identity(x) + self.assertIn('GPU:0', y.device) + self.assertIn('GPU:0', y.backing_device) + + @parameterized.named_parameters(('float_cpu0', 'CPU:0', 1.0, None), + ('int32_cpu0', 'CPU:0', [1], dtypes.int32), + ('string_cpu0', 'CPU:0', ['a'], None), + ('float_gpu0', 'GPU:0', 1.0, None), + ('int32_gpu0', 'GPU:0', [1], dtypes.int32), + ('string_gpu0', 'GPU:0', ['a'], None), + ('float_gpu99', 'GPU:99', 1.0, None), + ('int32_gpu99', 'GPU:99', [1], dtypes.int32), + ('string_gpu99', 'GPU:99', ['a'], None)) + def testHardPlacedCPUConstant(self, device, value, dtype): + with ops.device(device): + a = constant_op.constant(value, dtype=dtype) + self.assertIn('CPU:0', a.device) + self.assertIn('CPU:0', a.backing_device) + class ClusterPlacementTest(test.TestCase): def setUp(self): - context.context().soft_device_placement = True + super(ClusterPlacementTest, self).setUp() + context._reset_context() + config.set_soft_device_placement(enabled=True) context.context().log_device_placement = True workers, _ = test_util.create_local_cluster(2, 0) 
remote.connect_to_remote_host([workers[0].target, workers[1].target]) diff --git a/tensorflow/python/eager/forwardprop_test.py b/tensorflow/python/eager/forwardprop_test.py index fed04aec270..71473e51706 100644 --- a/tensorflow/python/eager/forwardprop_test.py +++ b/tensorflow/python/eager/forwardprop_test.py @@ -230,6 +230,14 @@ class ForwardpropTest(test.TestCase, parameterized.TestCase): )) self.assertAllClose([2. * 5. + 3. * 4.], self.evaluate(vp)) + def testNonDifferentiableOpWithInputTangent(self): + x = constant_op.constant(1.) + with forwardprop.ForwardAccumulator(x, 2.) as acc1: + with forwardprop.ForwardAccumulator(x, 2.) as acc2: + y = array_ops.zeros_like(x) + self.assertIsNone(acc1.jvp(y)) + self.assertIsNone(acc2.jvp(y)) + def testJVPFunctionUsedByAccumulatorForOps(self): previous_fn = forwardprop._jvp_dispatch try: diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py index 895a5de7765..c16060422b8 100644 --- a/tensorflow/python/eager/function.py +++ b/tensorflow/python/eager/function.py @@ -981,7 +981,7 @@ class _TapeGradientFunctions(object): self._func_graph.outputs, forward_function_attr) - if not self._func_graph.outputs or not input_tangents: + if not input_tangents: # There is no need to special-case forwardprop, so we can return the # forward+backward pair we've created without further wrapping. return (forward_function, self._func_graph, backward_function, @@ -1085,6 +1085,11 @@ class _TapeGradientFunctions(object): "StatefulPartitionedCall": gradient_function}): forward_outputs = forward_function.call(context.context(), forward_inputs) + if isinstance(forward_outputs, ops.Operation): + # _wrapped_backward_function expects a list, but if the function has + # no outputs its call() returns an Operation. We need to undo that + # so we don't cause problems later. + forward_outputs = [] py_backward, _ = self._wrap_backward_function( self._func_graph, backward_function, forward_outputs) # We will never request backward tape gradients for this operation diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py index 0a34d4a3852..7b599a995e2 100644 --- a/tensorflow/python/eager/function_test.py +++ b/tensorflow/python/eager/function_test.py @@ -1570,10 +1570,10 @@ class FunctionTest(test.TestCase, parameterized.TestCase): def testColocateWithRespected(self): # TODO(b/113291792): Use multiple CPUs instead of a GPU. 
with ops.device('cpu:0'): - x = constant_op.constant(1.0) + x = array_ops.identity(1.0) with ops.device('gpu:0'): - y = constant_op.constant(1.0) + y = array_ops.identity(1.0) @def_function.function def foo(): @@ -3239,9 +3239,9 @@ class MultiDeviceTest(test.TestCase, parameterized.TestCase): return b, a with ops.device('/device:CPU:0'): - a = constant_op.constant(3.0) + a = array_ops.identity(3.0) with ops.device('/device:GPU:0'): - b = constant_op.constant(5.0) + b = array_ops.identity(5.0) m1, m2 = func(a, b) self.assertAllEqual(m1.numpy(), 5.0) @@ -3306,9 +3306,9 @@ class MultiDeviceTest(test.TestCase, parameterized.TestCase): devices = ['/device:CPU:0', '/device:GPU:0'] for dev1, dev2 in itertools.product(devices, devices): with ops.device(dev1): - a = constant_op.constant(1.0) + a = array_ops.identity(1.0) with ops.device(dev2): - b = constant_op.constant(10.0) + b = array_ops.identity(10.0) ra, rb = func(a, b) self.assertEqual(ra.numpy(), 2.0) @@ -3469,13 +3469,13 @@ class MultiDeviceTest(test.TestCase, parameterized.TestCase): with ops.device('/device:CPU:0'): rc0 = resource_variable_ops.ResourceVariable(2.0) rc1 = resource_variable_ops.ResourceVariable(3.0) - cc0 = constant_op.constant(5.0) - cc1 = constant_op.constant(7.0) + cc0 = array_ops.identity(5.0) + cc1 = array_ops.identity(7.0) with ops.device('/device:GPU:0'): rg0 = resource_variable_ops.ResourceVariable(11.0) rg1 = resource_variable_ops.ResourceVariable(13.0) - cg0 = constant_op.constant(17.0) - cg1 = constant_op.constant(19.0) + cg0 = array_ops.identity(17.0) + cg1 = array_ops.identity(19.0) # Make sure tensors are on expected devices. for tensor in [cc0, cc1]: diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc index b5c9bfb6824..f8e1fb568ac 100644 --- a/tensorflow/python/eager/pywrap_tensor.cc +++ b/tensorflow/python/eager/pywrap_tensor.cc @@ -278,39 +278,13 @@ TFE_TensorHandle* ConvertToEagerTensorUncached(TFE_Context* ctx, } } - // Almost all TensorFlow kernels for GPU devices keep int32 tensors in host - // memory. We approximate the same behavior for eager execution - keeping - // int32 tensors in host memory. + // We always generate CPU:0 tensors, but we may need to change the device + // slightly, as for example from /job:localhost/... to /job:worker/... // - // We do so to preclude the need for callers into such kernels from having to - // explicitly place the int32 tensors in host memory. For example, without - // this, one needed: - // - // with tf.device('/gpu:0'): - // ...// code here - // with tf.device('/cpu:0'): - // shape = tf.constant(...) - // y = tf.random_uniform(shape) - // - // Without the CPU device block, tfe.ops.random_uniform would fail since the - // kernel expects the shape in host memory. - // - // With this support, we simplify the code: - // - // with tf.device('/gpu:0'): - // y = tf.random_uniform(...) - // - // The approximation is not exact there are GPU kernels which do not require - // host memory for int32 tensors. This will lead to a discrepancy between - // eager and graph execution. - // - // To support remote execution copy int32 tensors to another CPU device. - // TODO(ashankar): Fix this. + // Note that this is a shallow copy and will share the underlying buffer, + // because we are copying to the same device. 
if (device_name != nullptr && - (TFE_TensorHandleDataType(handle.get()) != TF_INT32 || - strstr(device_name, "/device:CPU:0") != nullptr)) { - // Note that this is a shallow copy and will share the underlying buffer - // if copying to the same device. + strstr(device_name, "/device:CPU:0") != nullptr) { handle = make_safe(TFE_TensorHandleCopyToDevice(handle.get(), ctx, device_name, status.get())); if (MaybeRaiseExceptionFromTFStatus(status.get(), PyExc_RuntimeError)) { @@ -318,6 +292,15 @@ TFE_TensorHandle* ConvertToEagerTensorUncached(TFE_Context* ctx, } } + // We always enable implicit mirroring for constants. Without this, code + // written previously under the assumption that + // + // with tf.device('GPU:0'): x = tf.constant(1.0) + // + // will be placed on the GPU, will suffer a non-trivial performance regression + // (measured at ~20% for certain benchmarks). + handle->handle->EnableImplicitMirroring(); + return handle.release(); } diff --git a/tensorflow/python/eager/remote_test.py b/tensorflow/python/eager/remote_test.py index 275da732c03..44af62666ee 100644 --- a/tensorflow/python/eager/remote_test.py +++ b/tensorflow/python/eager/remote_test.py @@ -24,6 +24,7 @@ from absl.testing import parameterized import numpy as np import six +from tensorflow.python.data.ops import dataset_ops from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver from tensorflow.python.eager import context from tensorflow.python.eager import def_function @@ -155,6 +156,70 @@ class SingleWorkerTest(test.TestCase, parameterized.TestCase): self.assertIn('Dimensions must be equal', cm.exception.args[0]) +class RemoteAsyncTest(test.TestCase): + + def setUp(self): + super(RemoteAsyncTest, self).setUp() + + workers, _ = test_util.create_local_cluster(1, 0) + remote.connect_to_remote_host(workers[0].target) + + def tearDown(self): + super(RemoteAsyncTest, self).tearDown() + + # Reset the context to avoid polluting other test cases.
+ context._reset_context() + + def test_out_of_range_with_while_loop(self): + + with ops.device('/job:worker/task:0'): + dataset = dataset_ops.Dataset.from_tensor_slices([1.0, 2.0]) + dataset = dataset.batch(1, drop_remainder=False) + iterator = iter(dataset) + v = variables.Variable(1.0) + + @def_function.function + def train_step(iterator): + i = next(iterator) + v.assign_add(math_ops.reduce_mean(i)) + + while True: + try: + with ops.device('/job:worker/task:0'): + train_step(iterator) + except (errors.OutOfRangeError, errors.InternalError): + context.async_clear_error() + break + + self.assertAllEqual(v.numpy(), 4.0) + + def test_out_of_range_with_for_loop(self): + + with ops.device('/job:worker/task:0'): + dataset = dataset_ops.Dataset.from_tensor_slices([1.0, 2.0]) + dataset = dataset.batch(1, drop_remainder=False) + iterator = iter(dataset) + v = variables.Variable(1.0) + + @def_function.function + def train_step(iterator): + i = next(iterator) + v.assign_add(math_ops.reduce_mean(i)) + + num_steps = 3 + for i in range(num_steps): + try: + with ops.device('/job:worker/task:0'): + train_step(iterator) + if i == num_steps - 1: + context.async_wait() + except errors.OutOfRangeError: + context.async_clear_error() + break + + self.assertAllEqual(v.numpy(), 4.0) + + class MultiWorkersTest(test.TestCase, parameterized.TestCase): def setUp(self): @@ -199,6 +264,42 @@ class MultiWorkersTest(test.TestCase, parameterized.TestCase): self.assertAllEqual(remote_function(constant_op.constant([1.0])), [3.0]) + @test_util.eager_lazy_remote_copy_on_and_off + def testMultiDeviceFunctionOnRemoteDeviceWithWait(self): + with ops.device('/job:worker/replica:0/task:1'): + variable_b = variables.Variable([1.0]) + + @def_function.function + def remote_function(i): + x = array_ops.ones([1000, 1000]) + for _ in range(1, 1000): + x = x * x + variable_b.assign_add(i) + a = 1.0 + variable_b + return a + + @def_function.function + def remote_function2(i): + variable_b.assign_add(i) + a = 1.0 + variable_b + return a + + # Runs first function: + # - on remote device + # - needs remote input + # - has side effects + # - runs much slower + with ops.device('/job:worker/replica:0/task:0'): + remote_function(constant_op.constant([2.0])) + + # Runs second function: + # - on remote device + # - has side effects + # There should be a sync point here and the next function will be executed + # only after the first function has completed.
+ with ops.device('/job:worker/replica:0/task:2'): + self.assertAllEqual(remote_function2(constant_op.constant([3.0])), [7.0]) + @test_util.eager_lazy_remote_copy_on_and_off def testMultiDeviceFunctionOnRemoteDevice(self): with ops.device('/job:worker/replica:0/task:1'): diff --git a/tensorflow/python/eager/tensor_test.py b/tensorflow/python/eager/tensor_test.py index dd1f049cdcc..fe4a7933a32 100644 --- a/tensorflow/python/eager/tensor_test.py +++ b/tensorflow/python/eager/tensor_test.py @@ -281,9 +281,9 @@ class TFETensorTest(test_util.TensorFlowTestCase): @test_util.run_gpu_only def testStringTensorOnGPU(self): with ops.device("/device:GPU:0"): - with self.assertRaisesRegexp( - RuntimeError, "Can't copy Tensor with type string to device"): - _create_tensor("test string") + t = _create_tensor("test string") + self.assertIn("CPU", t.device) + self.assertIn("CPU", t.backing_device) def testInvalidUTF8ProducesReasonableError(self): if sys.version_info[0] < 3: diff --git a/tensorflow/python/framework/config_test.py b/tensorflow/python/framework/config_test.py index 72612a21cbf..2ef7d737d73 100644 --- a/tensorflow/python/framework/config_test.py +++ b/tensorflow/python/framework/config_test.py @@ -33,6 +33,7 @@ from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import test_ops from tensorflow.python.framework import test_util +from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.platform import test from tensorflow.python.util import compat @@ -380,10 +381,10 @@ class DeviceTest(test.TestCase): with ops.device('/device:CPU:1'): b = constant_op.constant(1.0) self.evaluate(b) - with self.assertRaisesRegexp(RuntimeError, 'unknown device'): - with ops.device('/device:CPU:2'): - c = constant_op.constant(1.0) - self.evaluate(c) + with ops.device('/device:CPU:2'): + c = constant_op.constant(1.0) + self.evaluate(c) + self.assertIn('CPU:0', c.device) # Ensure we can place ops on each of the device names for vcpu in vcpus: @@ -408,6 +409,7 @@ class DeviceTest(test.TestCase): @test_util.run_gpu_only @reset_eager def testGpuNone(self): + config.set_soft_device_placement(False) gpus = config.list_physical_devices('GPU') self.assertGreater(len(gpus), 0) @@ -427,14 +429,16 @@ class DeviceTest(test.TestCase): self.assertEqual(len(config.get_visible_devices('GPU')), 0) self.assertEqual(len(config.list_logical_devices('XLA_GPU')), 0) - with self.assertRaisesRegexp(RuntimeError, 'unknown device'): + with self.assertRaisesRegexp(errors.InvalidArgumentError, + 'Could not satisfy'): with ops.device('/device:GPU:0'): - a = constant_op.constant(1.0) + a = array_ops.identity(1.0) self.evaluate(a) - with self.assertRaisesRegexp(RuntimeError, 'unknown device'): + with self.assertRaisesRegexp(errors.InvalidArgumentError, + 'Could not satisfy'): with ops.device('/device:XLA_GPU:0'): - a = constant_op.constant(1.0) + a = array_ops.identity(1.0) self.evaluate(a) # Modifying the visible devices is not supported @@ -465,6 +469,7 @@ class DeviceTest(test.TestCase): @test_util.run_gpu_only @reset_eager def testVirtualGpu(self): + config.set_soft_device_placement(False) gpus = config.list_physical_devices('GPU') self.assertNotEqual(len(gpus), 0) @@ -479,12 +484,13 @@ class DeviceTest(test.TestCase): self.assertTrue(len(logical_gpus), len(gpus) + 1) for i in range(0, len(logical_gpus)): with ops.device('/device:GPU:' + str(i)): - a = constant_op.constant(1.0) + a = array_ops.identity(1.0) 
self.evaluate(a) - with self.assertRaisesRegexp(RuntimeError, 'unknown device'): + with self.assertRaisesRegexp(errors.InvalidArgumentError, + 'Could not satisfy'): with ops.device('/device:GPU:' + str(len(logical_gpus))): - a = constant_op.constant(1.0) + a = array_ops.identity(1.0) self.evaluate(a) # Modifying the GPU configuration is not supported diff --git a/tensorflow/python/framework/constant_op.py b/tensorflow/python/framework/constant_op.py index 4d9aa29ad60..9736bb8b78b 100644 --- a/tensorflow/python/framework/constant_op.py +++ b/tensorflow/python/framework/constant_op.py @@ -224,6 +224,10 @@ def constant(value, dtype=None, shape=None, name="Const"): ... NotImplementedError: ... + `tf.constant` will _always_ create CPU (host) tensors. In order to create + tensors on other devices, use `tf.identity`. (If the `value` is an eager + Tensor, however, the tensor will be returned unmodified as mentioned above.) + Related Ops: * `tf.convert_to_tensor` is similar but: diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index f716dfa33dd..7af4bba606b 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -3171,6 +3171,8 @@ class Graph(object): Raises: ValueError: if another function is defined with the same name. """ + self._check_not_finalized() + name = function.name # Sanity checks on gradient definition. if (function.grad_func_name is not None) and (function.python_grad_func is @@ -3455,6 +3457,8 @@ class Graph(object): Returns: A list of the new `Operation` objects. """ + self._check_not_finalized() + # Create all Operation objects before accessing their inputs since an op may # be created before its inputs. new_ops = [ diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD index 7c89061ab2c..19c1b646103 100755 --- a/tensorflow/python/keras/BUILD +++ b/tensorflow/python/keras/BUILD @@ -141,6 +141,7 @@ py_library( "//tensorflow/python/keras/distribute:multi_worker_training_state", "//tensorflow/python/keras/utils:engine_utils", "//tensorflow/python/keras/utils:mode_keys", + "//tensorflow/python/profiler:profiler_v2", "//tensorflow/tools/docs:doc_controls", ], ) @@ -153,8 +154,8 @@ py_library( srcs_version = "PY2AND3", deps = [ ":backend", - "//tensorflow/python/eager:profiler", "//tensorflow/python/keras/utils:engine_utils", + "//tensorflow/python/profiler:profiler_v2", ], ) diff --git a/tensorflow/python/keras/applications/mobilenet.py b/tensorflow/python/keras/applications/mobilenet.py index 224e8c84496..128553f0d39 100644 --- a/tensorflow/python/keras/applications/mobilenet.py +++ b/tensorflow/python/keras/applications/mobilenet.py @@ -436,9 +436,34 @@ def _depthwise_conv_block(inputs, @keras_export('keras.applications.mobilenet.preprocess_input') def preprocess_input(x, data_format=None): + """Preprocesses a numpy array encoding a batch of images. + + Arguments: + x: A 4D numpy array consisting of RGB values within [0, 255]. + + Returns: + Preprocessed array. + + Raises: + ValueError: In case of unknown `data_format` argument. + """ return imagenet_utils.preprocess_input(x, data_format=data_format, mode='tf') @keras_export('keras.applications.mobilenet.decode_predictions') def decode_predictions(preds, top=5): + """Decodes the prediction result from the model. + + Arguments: + preds: Numpy tensor encoding a batch of predictions. + top: Integer, how many top-guesses to return. + + Returns: + A list of lists of top class prediction tuples + `(class_name, class_description, score)`.
+ One list of tuples per sample in batch input. + + Raises: + ValueError: In case of invalid shape of the `preds` array (must be 2D). + """ return imagenet_utils.decode_predictions(preds, top=top) diff --git a/tensorflow/python/keras/applications/mobilenet_v2.py b/tensorflow/python/keras/applications/mobilenet_v2.py index a983f6d7e46..b7eafe5496d 100644 --- a/tensorflow/python/keras/applications/mobilenet_v2.py +++ b/tensorflow/python/keras/applications/mobilenet_v2.py @@ -491,11 +491,31 @@ def _make_divisible(v, divisor, min_value=None): @keras_export('keras.applications.mobilenet_v2.preprocess_input') def preprocess_input(x, data_format=None): - """Preprocesses the input (encoding a batch of images) for the model.""" + """Preprocesses a numpy array encoding a batch of images. + + Arguments: + x: A 4D numpy array consisting of RGB values within [0, 255]. + + Returns: + Preprocessed array. + """ return imagenet_utils.preprocess_input(x, data_format=data_format, mode='tf') @keras_export('keras.applications.mobilenet_v2.decode_predictions') def decode_predictions(preds, top=5): - """Decodes the prediction result from the model.""" + """Decodes the prediction result from the model. + + Arguments: + preds: Numpy tensor encoding a batch of predictions. + top: Integer, how many top-guesses to return. + + Returns: + A list of lists of top class prediction tuples + `(class_name, class_description, score)`. + One list of tuples per sample in batch input. + + Raises: + ValueError: In case of invalid shape of the `preds` array (must be 2D). + """ return imagenet_utils.decode_predictions(preds, top=top) diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py index 50856e1f173..05f97256d76 100644 --- a/tensorflow/python/keras/backend.py +++ b/tensorflow/python/keras/backend.py @@ -257,9 +257,29 @@ def reset_uids(): @keras_export('keras.backend.clear_session') def clear_session(): - """Destroys the current TF graph and creates a new one. + """Destroys the current TF graph and session, and creates a new one. - Useful to avoid clutter from old models / layers. + Calling clear_session() releases the global graph state that Keras is + holding on to; resets the counters used for naming layers and + variables in Keras; and resets the learning phase. This helps avoid clutter + from old models and layers, especially when memory is limited, and a + common use-case for clear_session is releasing memory when building models + and layers in a loop. + + >>> import tensorflow as tf + >>> layers = [tf.keras.layers.Dense(10) for _ in range(10)] + >>> new_layer = tf.keras.layers.Dense(10) + >>> print(new_layer.name) + dense_10 + >>> tf.keras.backend.set_learning_phase(1) + >>> print(tf.keras.backend.learning_phase()) + 1 + >>> tf.keras.backend.clear_session() + >>> new_layer = tf.keras.layers.Dense(10) + >>> print(new_layer.name) + dense + >>> print(tf.keras.backend.learning_phase()) + 0 """ global _SESSION global _GRAPH_LEARNING_PHASES # pylint: disable=global-variable-not-assigned @@ -2080,16 +2100,22 @@ def var(x, axis=None, keepdims=False): def std(x, axis=None, keepdims=False): """Standard deviation of a tensor, alongside the specified axis. + It is an alias for `tf.math.reduce_std`. + Arguments: - x: A tensor or variable. - axis: An integer, the axis to compute the standard deviation. + x: A tensor or variable. It should have a numerical dtype. Boolean type + inputs will be converted to float. + axis: An integer, the axis to compute the standard deviation.
If `None` + (the default), reduces all dimensions. Must be in the range + `[-rank(x), rank(x))`. keepdims: A boolean, whether to keep the dimensions or not. If `keepdims` is `False`, the rank of the tensor is reduced - by 1. If `keepdims` is `True`, - the reduced dimension is retained with length 1. + by 1. If `keepdims` is `True`, the reduced dimension is retained with + length 1. Returns: - A tensor with the standard deviation of elements of `x`. + A tensor with the standard deviation of elements of `x` with the same dtype. + Boolean type input will be converted to float. """ if x.dtype.base_dtype == dtypes_module.bool: x = math_ops.cast(x, floatx()) @@ -5638,16 +5664,21 @@ def bias_add(x, bias, data_format=None): def random_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None): """Returns a tensor with normal distribution of values. + It is an alias for `tf.random.normal`. + Arguments: shape: A tuple of integers, the shape of tensor to create. - mean: A float, mean of the normal distribution to draw samples. - stddev: A float, standard deviation of the normal distribution - to draw samples. - dtype: String, dtype of returned tensor. - seed: Integer, random seed. + mean: A float, the mean value of the normal distribution to draw samples. + Defaults to 0.0. + stddev: A float, the standard deviation of the normal distribution + to draw samples. Defaults to 1.0. + dtype: `tf.dtypes.DType`, dtype of returned tensor. Defaults to the Keras + backend dtype, which is float32. + seed: Integer, random seed. Will use a random numpy integer when not + specified. Returns: - A tensor. + A tensor with normal distribution of values. if dtype is None: dtype = floatx() diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py index 5fae5eb9218..8651cf27375 100644 --- a/tensorflow/python/keras/callbacks.py +++ b/tensorflow/python/keras/callbacks.py @@ -48,6 +48,7 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops import summary_ops_v2 from tensorflow.python.ops import variables from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.profiler import profiler_v2 as profiler from tensorflow.python.training import checkpoint_management from tensorflow.python.util import nest from tensorflow.python.util.compat import collections_abc @@ -1575,11 +1576,25 @@ class TensorBoard(Callback): You can find more information about TensorBoard [here](https://www.tensorflow.org/get_started/summaries_and_tensorboard). - Example: + Example (Basic): ```python tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./logs") model.fit(x_train, y_train, epochs=2, callbacks=[tensorboard_callback]) - #run the tensorboard command to view the visualizations + # run the tensorboard command to view the visualizations. + ``` + Example (Profile): + ```python + # profile a single batch, e.g. the 5th batch. + tensorboard_callback = + tf.keras.callbacks.TensorBoard(log_dir='./logs', profile_batch=5) + model.fit(x_train, y_train, epochs=2, callbacks=[tensorboard_callback]) + # run the tensorboard command to view the visualizations in profile plugin. + + # profile a range of batches, e.g. from 10 to 20. + tensorboard_callback = + tf.keras.callbacks.TensorBoard(log_dir='./logs', profile_batch='10,20') + model.fit(x_train, y_train, epochs=2, callbacks=[tensorboard_callback]) + # run the tensorboard command to view the visualizations in profile plugin.
``` Arguments: @@ -1599,11 +1614,14 @@ class TensorBoard(Callback): callback will write the metrics and losses to TensorBoard every 1000 batches. Note that writing too frequently to TensorBoard can slow down your training. - profile_batch: Profile the batch to sample compute characteristics. By - default, it will profile the second batch. Set profile_batch=0 to - disable profiling. Must run in TensorFlow eager mode. - embeddings_freq: frequency (in epochs) at which embedding layers will - be visualized. If set to 0, embeddings won't be visualized. + profile_batch: Profile the batch(es) to sample compute characteristics. + profile_batch must be a non-negative integer or a comma-separated string + containing a pair of positive integers. A pair of positive integers + signifies a range of batches to profile. By default, it will profile the + second batch. Set profile_batch=0 to disable profiling. Must run in + TensorFlow eager mode. + embeddings_freq: frequency (in epochs) at which embedding layers will be + visualized. If set to 0, embeddings won't be visualized. embeddings_metadata: a dictionary which maps layer name to a file name in which metadata for this embedding layer is saved. See the [details]( @@ -1652,8 +1670,8 @@ class TensorBoard(Callback): self._train_run_name = 'train' self._validation_run_name = 'validation' self._writers = {} - - self._profile_batch = profile_batch + self._start_batch, self._stop_batch = self._init_profile_batch( + profile_batch) # True when a trace is running. self._is_tracing = False @@ -1827,10 +1845,49 @@ class TensorBoard(Callback): else: self._total_batches_seen[writer_name] += 1 + def _init_profile_batch(self, profile_batch): + """Validate profile_batch value and set the range of batches to profile. + + Arguments: + profile_batch: The range of batches to profile. Should be a non-negative + integer or a comma-separated string containing a pair of positive + integers. A pair of positive integers signifies a range of batches to + profile. + + Returns: + A pair of non-negative integers specifying the start and stop batch to + profile. + + Raises: + ValueError: If profile_batch is not an integer or a comma-separated pair + of positive integers. + + """ + profile_batch_error_message = ( + 'profile_batch must be a non-negative integer or a comma-separated ' + 'string containing a pair of positive integers. A pair of positive ' + 'integers signifies a range of batches to profile.') + try: + profile_range = [int(i) for i in str(profile_batch).split(',')] + except ValueError: + raise ValueError(profile_batch_error_message) + if len(profile_range) == 1: # single batch + start_batch, stop_batch = profile_range[0], profile_range[0] + if start_batch < 0: + raise ValueError(profile_batch_error_message) + elif len(profile_range) == 2: # (start_batch, stop_batch) + start_batch, stop_batch = profile_range + # [0, 0], [-1, 100], [6, 5] are illegal. + if start_batch <= 0 or start_batch > stop_batch: + raise ValueError(profile_batch_error_message) + else: + raise ValueError(profile_batch_error_message) + return start_batch, stop_batch + def on_train_begin(self, logs=None): self._init_batch_steps() - if self._profile_batch == 1: - summary_ops_v2.trace_on(graph=True, profiler=True) + if self._start_batch == 1: + summary_ops_v2.trace_on(graph=True, profiler=False) + profiler.start(logdir=os.path.join(self._log_write_dir, 'train')) self._is_tracing = True def on_test_begin(self, logs=None): @@ -1845,7 +1902,7 @@ class TensorBoard(Callback): batch: Integer, index of batch within the current epoch. logs: Dict.
Metric results for this batch. """ - if self.update_freq == 'epoch' and self._profile_batch is None: + if self.update_freq == 'epoch' and self._start_batch is None: return # Don't output batch_size and batch number as TensorBoard summaries @@ -1857,10 +1914,11 @@ class TensorBoard(Callback): self._increment_step(self._train_run_name) if context.executing_eagerly(): - if self._is_tracing: + if self._is_tracing and math_ops.greater_equal(train_batches, + self._stop_batch): self._log_trace() elif (not self._is_tracing and - math_ops.equal(train_batches, self._profile_batch - 1)): + math_ops.equal(train_batches, self._start_batch - 1)): self._enable_trace() def on_test_batch_end(self, batch, logs=None): @@ -1899,7 +1957,8 @@ class TensorBoard(Callback): def _enable_trace(self): if context.executing_eagerly(): - summary_ops_v2.trace_on(graph=True, profiler=True) + summary_ops_v2.trace_on(graph=True, profiler=False) + profiler.start(logdir=os.path.join(self._log_write_dir, 'train')) self._is_tracing = True def _log_trace(self): @@ -1909,10 +1968,8 @@ class TensorBoard(Callback): summary_ops_v2.always_record_summaries(): # TODO(b/126388999): Remove step info in the summary name. step = K.get_value(self._total_batches_seen[self._train_run_name]) - summary_ops_v2.trace_export( - name='batch_%d' % step, - step=step, - profiler_outdir=os.path.join(self._log_write_dir, 'train')) + summary_ops_v2.trace_export(name='batch_%d' % step, step=step) + profiler.stop() self._is_tracing = False def _log_metrics(self, logs, prefix, step): diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py index bf6d8cda6f2..34f0138560c 100644 --- a/tensorflow/python/keras/callbacks_test.py +++ b/tensorflow/python/keras/callbacks_test.py @@ -1805,6 +1805,15 @@ class TestTensorBoardV2NonParameterizedTest(keras_parameterized.TestCase): experimental_run_tf_function=testing_utils.should_run_tf_function()) return model + def _get_trace_file(self, logdir): + profile_dir = os.path.join(logdir, 'plugins', 'profile') + for (dirpath, dirnames, filenames) in os.walk(profile_dir): + del dirnames # unused + for filename in filenames: + if filename.endswith('.trace'): + return os.path.join(dirpath, filename) + return None + def fitModelAndAssertKerasModelWritten(self, model): x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) tb_cbk = keras.callbacks.TensorBoard(self.logdir, @@ -1873,6 +1882,7 @@ class TestTensorBoardV2NonParameterizedTest(keras_parameterized.TestCase): _ObservedSummary(logdir=self.train_dir, tag=u'batch_1'), }, ) + self.assertIsNotNone(self._get_trace_file(logdir=self.train_dir)) def test_TensorBoard_autoTrace_tagNameWithBatchNum(self): model = self._get_seq_model() @@ -1895,6 +1905,78 @@ class TestTensorBoardV2NonParameterizedTest(keras_parameterized.TestCase): _ObservedSummary(logdir=self.train_dir, tag=u'batch_2'), }, ) + self.assertIsNotNone(self._get_trace_file(logdir=self.train_dir)) + + def test_TensorBoard_autoTrace_profileBatchRangeSingle(self): + model = self._get_seq_model() + x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) + tb_cbk = keras.callbacks.TensorBoard( + self.logdir, histogram_freq=1, profile_batch='2,2', write_graph=False) + + model.fit( + x, + y, + batch_size=3, + epochs=2, + validation_data=(x, y), + callbacks=[tb_cbk]) + summary_file = list_summaries(self.logdir) + + self.assertEqual( + summary_file.tensors, + { + # Trace will be logged once at the batch it stops profiling. 
+ _ObservedSummary(logdir=self.train_dir, tag=u'batch_2'), + }, + ) + self.assertIsNotNone(self._get_trace_file(logdir=self.train_dir)) + + def test_TensorBoard_autoTrace_profileBatchRange(self): + model = self._get_seq_model() + x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) + tb_cbk = keras.callbacks.TensorBoard( + self.logdir, histogram_freq=1, profile_batch='1,3', write_graph=False) + + model.fit( + x, + y, + batch_size=4, + epochs=2, + validation_data=(x, y), + callbacks=[tb_cbk]) + summary_file = list_summaries(self.logdir) + + self.assertEqual( + summary_file.tensors, + { + # Trace will be logged once at the batch it stops profiling. + _ObservedSummary(logdir=self.train_dir, tag=u'batch_3'), + }, + ) + self.assertIsNotNone(self._get_trace_file(logdir=self.train_dir)) + + def test_TensorBoard_autoTrace_profileInvalidBatchRange(self): + with self.assertRaises(ValueError): + keras.callbacks.TensorBoard( + self.logdir, + histogram_freq=1, + profile_batch='-1,3', + write_graph=False) + + with self.assertRaises(ValueError): + keras.callbacks.TensorBoard( + self.logdir, + histogram_freq=1, + profile_batch='1,None', + write_graph=False) + + with self.assertRaises(ValueError): + keras.callbacks.TensorBoard( + self.logdir, histogram_freq=1, profile_batch='6,5', write_graph=False) + + with self.assertRaises(ValueError): + keras.callbacks.TensorBoard( + self.logdir, histogram_freq=1, profile_batch=-1, write_graph=False) def test_TensorBoard_autoTrace_profile_batch_largerThanBatchCount(self): model = self._get_seq_model() @@ -1913,6 +1995,7 @@ class TestTensorBoardV2NonParameterizedTest(keras_parameterized.TestCase): # Enabled trace only on the 10000th batch, thus it should be empty. self.assertEmpty(summary_file.tensors) + self.assertIsNone(self._get_trace_file(logdir=self.train_dir)) class MostRecentlyModifiedFileMatchingPatternTest(test.TestCase): diff --git a/tensorflow/python/keras/callbacks_v1.py b/tensorflow/python/keras/callbacks_v1.py index db0d2b9f4b5..524e039f597 100644 --- a/tensorflow/python/keras/callbacks_v1.py +++ b/tensorflow/python/keras/callbacks_v1.py @@ -24,7 +24,6 @@ import os import numpy as np from tensorflow.python.eager import context -from tensorflow.python.eager import profiler from tensorflow.python.framework import dtypes from tensorflow.python.keras import backend as K from tensorflow.python.keras import callbacks @@ -33,6 +32,7 @@ from tensorflow.python.ops import state_ops from tensorflow.python.ops import summary_ops_v2 from tensorflow.python.ops import variables from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.profiler import profiler_v2 as profiler from tensorflow.python.summary import summary as tf_summary from tensorflow.python.training import saver from tensorflow.python.util.tf_export import keras_export @@ -359,16 +359,16 @@ class TensorBoard(callbacks.Callback): self._samples_seen_at_last_write = self._samples_seen self._total_batches_seen += 1 if self._is_profiling: - profiler.save(self.log_dir, profiler.stop()) + profiler.stop() self._is_profiling = False elif (not self._is_profiling and self._total_batches_seen == self._profile_batch - 1): - profiler.start() + profiler.start(self.log_dir) self._is_profiling = True def on_train_begin(self, logs=None): if self._profile_batch == 1: - profiler.start() + profiler.start(self.log_dir) self._is_profiling = True def on_epoch_begin(self, epoch, logs=None): @@ -452,6 +452,6 @@ class TensorBoard(callbacks.Callback): def on_train_end(self, logs=None): if self._is_profiling: - 
profiler.save(self.log_dir, profiler.stop()) + profiler.stop() self._is_profiling = False self.writer.close() diff --git a/tensorflow/python/keras/distribute/distribute_strategy_test.py b/tensorflow/python/keras/distribute/distribute_strategy_test.py index 81609d7092c..1793383bbe3 100644 --- a/tensorflow/python/keras/distribute/distribute_strategy_test.py +++ b/tensorflow/python/keras/distribute/distribute_strategy_test.py @@ -42,11 +42,13 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import check_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn +from tensorflow.python.ops import variables from tensorflow.python.ops.losses import loss_reduction from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.platform import test from tensorflow.python.training import gradient_descent from tensorflow.python.training import rmsprop +from tensorflow.python.util import nest _RANDOM_SEED = 1337 _TRAIN_SIZE = 200 @@ -543,6 +545,25 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase, self.assertIsNotNone(grad_v1) self.assertIsNotNone(grad_v2) + @combinations.generate( + combinations.combine( + distribution=[strategy_combinations.one_device_strategy] + + tpu_strategies, + mode=['graph', 'eager'])) + def test_optimizer_in_cross_replica_context_raises_error(self, distribution): + + with self.cached_session(), distribution.scope(): + model = keras.models.Sequential([keras.layers.Dense(1)]) + x = np.array([[1.]]) + with backprop.GradientTape() as tape: + y = model(x) + gradients = tape.gradient(y, model.trainable_variables) + optimizer = gradient_descent_keras.SGD() + + with self.assertRaisesRegex(RuntimeError, + 'cannot be called in cross-replica context'): + optimizer.apply_gradients(zip(gradients, model.trainable_variables)) + @combinations.generate(all_strategy_combinations_plus_run_distributed()) def test_calling_model_with_nested_numpy_arrays(self, distribution, experimental_run_tf_function): @@ -2153,6 +2174,91 @@ class TestDistributionStrategyWithMultipleAddLossAndMetricCalls( results['sparse_categorical_crossentropy'], results['loss'], 1e-6) +class DeterministicModel(keras.Model): + """Deterministic Model that always outputs the same initial result. + + It verifies the `call` method is run inside the same distribution + strategy that the model was initially passed. 
+ """ + + def __init__(self, strategy): + super(DeterministicModel, self).__init__() + self.x = None + self.strategy = strategy + + def build(self, input_shape): + self.x = variables.Variable(array_ops.ones(shape=())) + + def call(self, inputs, training=None, mask=None): + active_strategy = distribution_strategy_context.get_strategy() + if active_strategy is not self.strategy: + raise ValueError('Model must execute call w/ the original strategy') + return self.x * inputs + + +class TestModelCapturesStrategy(test.TestCase, parameterized.TestCase): + """Tests that model creation captures the strategy.""" + + @combinations.generate( + combinations.combine( + distribution=strategy_combinations.all_strategies, + mode=['eager'])) + def test_fit_and_evaluate(self, distribution): + dataset = dataset_ops.DatasetV2.from_tensor_slices( + (array_ops.ones(shape=(64,)), array_ops.ones(shape=(64,)))) + dataset = dataset.batch(8 * distribution.num_replicas_in_sync) + # Make model with distribution strategy + with distribution.scope(): + model = DeterministicModel(distribution) + + # Compile & evaluate the model outside of the distribution strategy scope + model.compile( + optimizer=keras.optimizers.adam_v2.Adam(1e-4), + loss=keras.losses.MeanSquaredError(), + metrics=['binary_accuracy']) + + # Non-eager training doesn't support steps_per_epoch=None. + for unused_epoch in range(2): + model.fit(dataset) + + results = model.evaluate(dataset) + results = dict(zip(model.metrics_names, results)) + + # Check that the metrics have a result we expect + self.assertEqual(results['binary_accuracy'], 1.0) + self.assertAllClose(results['loss'], 0.0) + + # Assert that all metric/optimizer/model variables were made in the + # distribution strategy (Test that compile uses the captured + # distribution strategy) + metric_vars = nest.flatten( + [metric.variables for metric in model.metrics]) + for var in metric_vars: + self.assertTrue(distribution.extended.variable_created_in_scope(var)) + for var in model.optimizer._weights: + self.assertTrue(distribution.extended.variable_created_in_scope(var)) + for var in model.variables: + self.assertTrue(distribution.extended.variable_created_in_scope(var)) + + # Make sure the metric must be created in the same scope as the model: + # This shouldn't raise any validation errors + with distribution.scope(): + metric = keras.metrics.BinaryAccuracy() + model.compile( + optimizer=keras.optimizers.adam_v2.Adam(1e-4), + loss=keras.losses.MeanSquaredError(), + metrics=[metric]) + + # This should raise an error because the metric is constructed + # outside of the scope, and not by compile + if distribution_strategy_context.has_strategy(): + with self.assertRaisesRegexp( + ValueError, 'All metrics must be created in'): + model.compile( + optimizer=keras.optimizers.adam_v2.Adam(1e-4), + loss=keras.losses.MeanSquaredError(), + metrics=[keras.metrics.BinaryAccuracy()]) + if __name__ == '__main__': base_layer_utils.enable_v2_dtype_behavior() test.main() diff --git a/tensorflow/python/keras/distribute/multi_worker_callback_tf2_test.py b/tensorflow/python/keras/distribute/multi_worker_callback_tf2_test.py index 920624f83fd..63fe86adcad 100644 --- a/tensorflow/python/keras/distribute/multi_worker_callback_tf2_test.py +++ b/tensorflow/python/keras/distribute/multi_worker_callback_tf2_test.py @@ -109,6 +109,32 @@ class KerasCallbackMultiProcessTest(parameterized.TestCase, test.TestCase): cluster_spec=test_base.create_cluster_spec(num_workers=2), args=(self, file_format)) + 
@combinations.generate(combinations.combine(mode=['eager'])) + def test_model_checkpoint_works_with_same_file_path(self, mode): + + def proc_model_checkpoint_works_with_same_file_path( + test_obj, saving_filepath): + model, _, train_ds, steps = _model_setup(test_obj, file_format='') + num_epoch = 2 + + # The saving_filepath shouldn't exist at the beginning (as it's unique). + test_obj.assertFalse(file_io.file_exists(saving_filepath)) + + model.fit( + x=train_ds, + epochs=num_epoch, + steps_per_epoch=steps, + callbacks=[callbacks.ModelCheckpoint(filepath=saving_filepath)]) + + test_obj.assertTrue(file_io.file_exists(saving_filepath)) + + saving_filepath = os.path.join(self.get_temp_dir(), 'checkpoint') + + multi_process_runner.run( + proc_model_checkpoint_works_with_same_file_path, + cluster_spec=test_base.create_cluster_spec(num_workers=2), + args=(self, saving_filepath)) + @combinations.generate(combinations.combine(mode=['eager'])) def test_tensorboard_saves_on_chief_but_not_otherwise(self, mode): @@ -174,6 +200,31 @@ class KerasCallbackMultiProcessTest(parameterized.TestCase, test.TestCase): cluster_spec=test_base.create_cluster_spec(num_workers=2), args=(self,)) + @combinations.generate(combinations.combine(mode=['eager'])) + def test_tensorboard_works_with_same_file_path(self, mode): + + def proc_tensorboard_works_with_same_file_path(test_obj, saving_filepath): + model, _, train_ds, steps = _model_setup(test_obj, file_format='') + num_epoch = 2 + + # The saving_filepath shouldn't exist at the beginning (as it's unique). + test_obj.assertFalse(file_io.file_exists(saving_filepath)) + + model.fit( + x=train_ds, + epochs=num_epoch, + steps_per_epoch=steps, + callbacks=[callbacks.TensorBoard(log_dir=saving_filepath)]) + + test_obj.assertTrue(file_io.list_directory(saving_filepath)) + + saving_filepath = os.path.join(self.get_temp_dir(), 'logfile') + + multi_process_runner.run( + proc_tensorboard_works_with_same_file_path, + cluster_spec=test_base.create_cluster_spec(num_workers=2), + args=(self, saving_filepath)) + @combinations.generate(combinations.combine(mode=['eager'])) def test_early_stopping(self, mode): diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index c097398d90d..0e09bb291c5 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -93,13 +93,12 @@ _keras_model_gauge = monitoring.BoolGauge( @keras_export('keras.layers.Layer') class Layer(module.Module, version_utils.LayerVersionSelector): - """Base layer class. + """This is the class from which all layers inherit. - This is the class from which all layers inherit. - - A layer is a class implementing common neural networks operations, such - as convolution, batch norm, etc. These operations require managing weights, - losses, updates, and inter-layer connectivity. + A layer is a callable object that takes as input one or more tensors and + that outputs one or more tensors. It involves *computation*, defined + in the `call()` method, and a *state* (weight variables), defined + either in the constructor `__init__()` or in the `build()` method. Users will just instantiate a layer and then treat it as a callable. @@ -125,6 +124,103 @@ class Layer(module.Module, version_utils.LayerVersionSelector): This method is used when saving the layer or a model that contains this layer. + Examples: + + Here's a basic example: a layer with two variables, `w` and `b`, + that returns `y = w . x + b`. 
+  It shows how to implement `build()` and `call()`.
+  Variables set as attributes of a layer are tracked as weights
+  of the layers (in `layer.weights`).
+
+  ```python
+  class SimpleDense(Layer):
+
+    def __init__(self, units=32):
+      super(SimpleDense, self).__init__()
+      self.units = units
+
+    def build(self, input_shape):  # Create the state of the layer (weights)
+      w_init = tf.random_normal_initializer()
+      self.w = tf.Variable(
+          initial_value=w_init(shape=(input_shape[-1], self.units),
+                               dtype='float32'),
+          trainable=True)
+      b_init = tf.zeros_initializer()
+      self.b = tf.Variable(
+          initial_value=b_init(shape=(self.units,), dtype='float32'),
+          trainable=True)
+
+    def call(self, inputs):  # Defines the computation from inputs to outputs
+      return tf.matmul(inputs, self.w) + self.b
+
+  # Instantiates the layer.
+  linear_layer = SimpleDense(4)
+
+  # This will also call `build(input_shape)` and create the weights.
+  y = linear_layer(tf.ones((2, 2)))
+  assert len(linear_layer.weights) == 2
+
+  # These weights are trainable, so they're listed in `trainable_weights`:
+  assert len(linear_layer.trainable_weights) == 2
+  ```
+
+  Note that the method `add_weight()` offers a shortcut to create weights:
+
+  ```python
+  class SimpleDense(Layer):
+
+    def __init__(self, units=32):
+      super(SimpleDense, self).__init__()
+      self.units = units
+
+    def build(self, input_shape):
+      self.w = self.add_weight(shape=(input_shape[-1], self.units),
+                               initializer='random_normal',
+                               trainable=True)
+      self.b = self.add_weight(shape=(self.units,),
+                               initializer='random_normal',
+                               trainable=True)
+
+    def call(self, inputs):
+      return tf.matmul(inputs, self.w) + self.b
+  ```
+
+  Besides trainable weights, updated via backpropagation during training,
+  layers can also have non-trainable weights. These weights are meant to
+  be updated manually during `call()`. Here's an example layer that computes
+  the running sum of its inputs:
+
+  ```python
+  class ComputeSum(Layer):
+
+    def __init__(self, input_dim):
+      super(ComputeSum, self).__init__()
+      # Create a non-trainable weight.
+      self.total = tf.Variable(initial_value=tf.zeros((input_dim,)),
+                               trainable=False)
+
+    def call(self, inputs):
+      self.total.assign_add(tf.reduce_sum(inputs, axis=0))
+      return self.total
+
+  my_sum = ComputeSum(2)
+  x = tf.ones((2, 2))
+
+  y = my_sum(x)
+  print(y.numpy())  # [2. 2.]
+
+  y = my_sum(x)
+  print(y.numpy())  # [4. 4.]
+
+  assert my_sum.weights == [my_sum.total]
+  assert my_sum.non_trainable_weights == [my_sum.total]
+  assert my_sum.trainable_weights == []
+  ```
+
+  For more information about creating layers, see the guide
+  [Writing custom layers and models with Keras](
+    https://www.tensorflow.org/guide/keras/custom_layers_and_models)
+
   Arguments:
     trainable: Boolean, whether the layer's variables should be trainable.
     name: String name of the layer.
@@ -279,7 +375,15 @@ class Layer(module.Module, version_utils.LayerVersionSelector):
     # Manage initial weight values if passed.
     self._initial_weights = kwargs.get('weights', None)
 
+    # Whether the layer should track layers that are set as attributes on it
+    # as sub-layers; the weights of those sub-layers are then included in the
+    # parent layer's variables() as well.
+    # Defaults to True, meaning auto-tracking is on. Certain subclasses, such
+    # as the Sequential model, may want to turn it off.
+    self._auto_track_sub_layers = True
+
+  @trackable.no_automatic_dependency_tracking
+  @base_layer_utils.default
   def build(self, input_shape):
     """Creates the variables of the layer (optional, for subclass implementers).
@@ -870,21 +974,24 @@ class Layer(module.Module, version_utils.LayerVersionSelector):
 
   @property
   def dtype(self):
+    """Dtype used by the weights of the layer, set in the constructor."""
     return self._dtype_policy.variable_dtype
 
   @property
   def name(self):
+    """Name of the layer (string), set in the constructor."""
     return self._name
 
   @property
   @trackable_layer_utils.cache_recursive_attribute('dynamic')
   def dynamic(self):
+    """Whether the layer is dynamic (eager-only); set in the constructor."""
     # NOTE(taylorrobie): Currently self._dynamic is read-only. If that changes
     # then this cache logic must be updated.
     return self._dynamic
 
   @property
-  @doc_controls.do_not_generate_docs
+  @doc_controls.do_not_doc_inheritable
   @trackable_layer_utils.cache_recursive_attribute('stateful')
   def stateful(self):
     return self._stateful
@@ -916,6 +1023,37 @@ class Layer(module.Module, version_utils.LayerVersionSelector):
 
   @property
   def input_spec(self):
+    """`InputSpec` instance(s) describing the input format for this layer.
+
+    When you create a layer subclass, you can set `self.input_spec` to enable
+    the layer to run input compatibility checks when it is called.
+    Consider a `Conv2D` layer: it can only be called on a single input tensor
+    of rank 4. As such, you can set, in `__init__()`:
+
+    ```python
+    self.input_spec = tf.keras.layers.InputSpec(ndim=4)
+    ```
+
+    Now, if you try to call the layer on an input that isn't rank 4
+    (for instance, an input of shape `(2,)`), it will raise a
+    nicely-formatted error:
+
+    ```
+    ValueError: Input 0 of layer conv2d is incompatible with the layer:
+    expected ndim=4, found ndim=1. Full shape received: [2]
+    ```
+
+    Input checks that can be specified via `input_spec` include:
+    - Structure (e.g. a single input, a list of 2 inputs, etc.)
+    - Shape
+    - Rank (ndim)
+    - Dtype
+
+    For more information, see `tf.keras.layers.InputSpec`.
+
+    Returns:
+      A `tf.keras.layers.InputSpec` instance, or nested structure thereof.
+    """
     return self._input_spec
 
   @input_spec.setter
@@ -931,6 +1069,13 @@ class Layer(module.Module, version_utils.LayerVersionSelector):
 
   @property
   def trainable_weights(self):
+    """List of all trainable weights tracked by this layer.
+
+    Trainable weights are updated via gradient descent during training.
+
+    Returns:
+      A list of trainable variables.
+    """
     if self.trainable:
       children_weights = self._gather_children_attribute('trainable_weights')
       return self._dedup_weights(self._trainable_weights + children_weights)
@@ -939,6 +1084,14 @@ class Layer(module.Module, version_utils.LayerVersionSelector):
 
   @property
   def non_trainable_weights(self):
+    """List of all non-trainable weights tracked by this layer.
+
+    Non-trainable weights are *not* updated during training. They are expected
+    to be updated manually in `call()`.
+
+    Returns:
+      A list of non-trainable variables.
+ """ if self.trainable: children_weights = self._gather_children_attribute( 'non_trainable_weights') @@ -960,6 +1113,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): return self.trainable_weights + self.non_trainable_weights @property + @doc_controls.do_not_doc_inheritable def updates(self): collected_updates = [] all_layers = self._gather_unique_layers() @@ -1010,7 +1164,6 @@ class Layer(module.Module, version_utils.LayerVersionSelector): collected_losses.append(loss_tensor) return collected_losses - @doc_controls.for_subclass_implementers def add_loss(self, losses, inputs=None): """Add loss tensor(s), potentially dependent on layer inputs. @@ -1145,6 +1298,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): @property def metrics(self): + """List of `tf.keras.metrics.Metric` instances tracked by the layer.""" collected_metrics = [] all_layers = self._gather_unique_layers() for layer in all_layers: @@ -1152,7 +1306,6 @@ class Layer(module.Module, version_utils.LayerVersionSelector): collected_metrics.extend(layer._metrics) return collected_metrics - @doc_controls.for_subclass_implementers def add_metric(self, value, aggregation=None, name=None): """Adds metric tensor to the layer. @@ -1225,7 +1378,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): @deprecation.deprecated_args(None, '`inputs` is now automatically inferred', 'inputs') - @doc_controls.for_subclass_implementers + @doc_controls.do_not_doc_inheritable def add_update(self, updates, inputs=None): """Add update op(s), potentially dependent on layer inputs. @@ -1433,6 +1586,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): output_weights.append(weight) return backend.batch_get_value(output_weights) + @doc_controls.do_not_generate_docs def get_updates_for(self, inputs): """Retrieves updates relevant to a specific set of inputs. @@ -1452,6 +1606,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): reachable = tf_utils.get_reachable_from_inputs(inputs, updates) return [u for u in updates if u in reachable] + @doc_controls.do_not_doc_inheritable def get_losses_for(self, inputs): """Retrieves losses relevant to a specific set of inputs. @@ -1471,6 +1626,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): reachable = tf_utils.get_reachable_from_inputs(inputs, losses) return [l for l in losses if l in reachable] + @doc_controls.do_not_doc_inheritable def get_input_mask_at(self, node_index): """Retrieves the input mask tensor(s) of a layer at a given node. @@ -1490,6 +1646,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): else: return getattr(inputs, '_keras_mask', None) + @doc_controls.do_not_doc_inheritable def get_output_mask_at(self, node_index): """Retrieves the output mask tensor(s) of a layer at a given node. @@ -1510,6 +1667,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): return getattr(output, '_keras_mask', None) @property + @doc_controls.do_not_doc_inheritable def input_mask(self): """Retrieves the input mask tensor(s) of a layer. @@ -1531,6 +1689,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): return getattr(inputs, '_keras_mask', None) @property + @doc_controls.do_not_doc_inheritable def output_mask(self): """Retrieves the output mask tensor(s) of a layer. 
@@ -1551,6 +1710,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): else: return getattr(output, '_keras_mask', None) + @doc_controls.do_not_doc_inheritable def get_input_shape_at(self, node_index): """Retrieves the input shape(s) of a layer at a given node. @@ -1570,6 +1730,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): return self._get_node_attribute_at_index(node_index, 'input_shapes', 'input shape') + @doc_controls.do_not_doc_inheritable def get_output_shape_at(self, node_index): """Retrieves the output shape(s) of a layer at a given node. @@ -1589,6 +1750,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): return self._get_node_attribute_at_index(node_index, 'output_shapes', 'output shape') + @doc_controls.do_not_doc_inheritable def get_input_at(self, node_index): """Retrieves the input tensor(s) of a layer at a given node. @@ -1607,6 +1769,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): return self._get_node_attribute_at_index(node_index, 'input_tensors', 'input') + @doc_controls.do_not_doc_inheritable def get_output_at(self, node_index): """Retrieves the output tensor(s) of a layer at a given node. @@ -1664,6 +1827,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): return self._get_node_attribute_at_index(0, 'output_tensors', 'output') @property + @doc_controls.do_not_doc_inheritable def input_shape(self): """Retrieves the input shape(s) of a layer. @@ -1717,6 +1881,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): return layer_utils.count_params(self.weights) @property + @doc_controls.do_not_doc_inheritable def output_shape(self): """Retrieves the output shape(s) of a layer. @@ -1789,6 +1954,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): return self.add_weight(*args, **kwargs) @property + @doc_controls.do_not_generate_docs def variables(self): """Returns the list of all layer variables/weights. @@ -1800,10 +1966,12 @@ class Layer(module.Module, version_utils.LayerVersionSelector): return self.weights @property + @doc_controls.do_not_generate_docs def trainable_variables(self): return self.trainable_weights @property + @doc_controls.do_not_generate_docs def non_trainable_variables(self): return self.non_trainable_weights @@ -2390,10 +2558,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): # TODO(scottzhu): Need to track Module object as well for weight tracking. # Be careful about metric if it becomes a Module in future. # Append value to self._layers if relevant - - # Sequential models use a separate layer tracking mechanism, so skip the - # logic defined here for tracking layers. 
-    if (self.__class__.__name__ != 'Sequential' and
+    if (getattr(self, '_auto_track_sub_layers', True) and
         (isinstance(value, Layer) or trackable_layer_utils.has_weights(value))):
       self._maybe_create_attribute('_layers', [])
       # We need to check object identity to avoid de-duplicating empty
diff --git a/tensorflow/python/keras/engine/base_layer_test.py b/tensorflow/python/keras/engine/base_layer_test.py
index 86b0689d026..41d8223a44b 100644
--- a/tensorflow/python/keras/engine/base_layer_test.py
+++ b/tensorflow/python/keras/engine/base_layer_test.py
@@ -900,6 +900,33 @@ class NestedTrackingTest(test.TestCase):
     self.assertEqual(defun_layer._call_fn_args,
                      ['x', 'mask', 'a', 'training', 'b'])
 
+  def test_sequential_model(self):
+    model = keras.Sequential([keras.layers.Dense(10, input_shape=(10,)),
+                              keras.layers.Dense(5)])
+    self.assertLen(model.layers, 2)
+    self.assertLen(model.weights, 4)
+
+    # Make sure a subclass model also works when it is called 'Sequential'.
+    class Sequential(keras.Model):
+
+      def __init__(self):
+        super(Sequential, self).__init__()
+        self.dense_layers = [keras.layers.Dense(10),
+                             keras.layers.Dense(5)]
+
+      def call(self, inputs):
+        x = inputs
+        for d in self.dense_layers:
+          x = d(x)
+        return x
+
+    s = Sequential()
+    self.assertLen(s.layers, 2)
+    self.assertLen(s.weights, 0)
+
+    s(keras.Input((10,)))
+    self.assertLen(s.weights, 4)
+
 
 @test_util.run_all_in_graph_and_eager_modes
 class NameScopingTest(keras_parameterized.TestCase):
diff --git a/tensorflow/python/keras/engine/base_layer_v1.py b/tensorflow/python/keras/engine/base_layer_v1.py
index 2dc3a71aac3..60ee17d76d5 100644
--- a/tensorflow/python/keras/engine/base_layer_v1.py
+++ b/tensorflow/python/keras/engine/base_layer_v1.py
@@ -178,6 +178,7 @@ class Layer(base_layer.Layer):
     # Indicates whether `build` needs to be called upon layer call, to create
     # the layer's weights.
     self.built = False
+    self._build_input_shape = None
     # Provides information about which inputs are compatible with the layer.
     self._input_spec = None
     self.supports_masking = False
@@ -245,6 +246,15 @@ class Layer(base_layer.Layer):
     # Manage initial weight values if passed.
     self._initial_weights = kwargs.get('weights', None)
 
+    # Whether the layer should track layers that are set as attributes on it
+    # as sub-layers; the weights of those sub-layers are then included in the
+    # parent layer's variables() as well.
+    # Defaults to True, meaning auto-tracking is on. Certain subclasses, such
+    # as the Sequential model, may want to turn it off.
+    self._auto_track_sub_layers = True
+
+  @trackable.no_automatic_dependency_tracking
+  @base_layer_utils.default
   def build(self, input_shape):
     """Creates the variables of the layer (optional, for subclass implementers).
 
@@ -259,6 +269,8 @@ class Layer(base_layer.Layer):
       `TensorShape` if the layer expects a list of inputs (one instance per
       input).
     """
+    if not hasattr(self.build, '_is_default'):
+      self._build_input_shape = input_shape
     self.built = True
 
   @doc_controls.for_subclass_implementers
@@ -2251,10 +2263,7 @@ class Layer(base_layer.Layer):
     # TODO(scottzhu): Need to track Module object as well for weight tracking.
     # Be careful about metric if it becomes a Module in future.
     # Append value to self._layers if relevant
-
-    # Sequential models use a separate layer tracking mechanism, so skip the
-    # logic defined here for tracking layers.
-    if (self.__class__.__name__ != 'Sequential' and
+    if (getattr(self, '_auto_track_sub_layers', True) and
         (isinstance(value, Layer) or trackable_layer_utils.has_weights(value))):
       self._maybe_create_attribute('_layers', [])
       # We need to check object identity to avoid de-duplicating empty
diff --git a/tensorflow/python/keras/engine/compile_utils.py b/tensorflow/python/keras/engine/compile_utils.py
index 74c6370fce6..85ea00dcffe 100644
--- a/tensorflow/python/keras/engine/compile_utils.py
+++ b/tensorflow/python/keras/engine/compile_utils.py
@@ -172,14 +172,19 @@ class LossesContainer(object):
         loss_metric_values.append(loss_metric_value)
 
     if regularization_losses:
+      regularization_losses = losses_utils.cast_losses_to_common_dtype(
+          regularization_losses)
       reg_loss = math_ops.add_n(regularization_losses)
       loss_metric_values.append(reg_loss)
       loss_values.append(losses_utils.scale_loss_for_distribution(reg_loss))
 
     if loss_values:
+      loss_metric_values = losses_utils.cast_losses_to_common_dtype(
+          loss_metric_values)
       total_loss_metric_value = math_ops.add_n(loss_metric_values)
       self._loss_metric.update_state(total_loss_metric_value)
 
+      loss_values = losses_utils.cast_losses_to_common_dtype(loss_values)
       total_loss = math_ops.add_n(loss_values)
       return total_loss
     else:
diff --git a/tensorflow/python/keras/engine/input_spec.py b/tensorflow/python/keras/engine/input_spec.py
index d790af4798f..5cb38805569 100644
--- a/tensorflow/python/keras/engine/input_spec.py
+++ b/tensorflow/python/keras/engine/input_spec.py
@@ -32,10 +32,13 @@ from tensorflow.python.util.tf_export import tf_export
 @keras_export('keras.layers.InputSpec')
 @tf_export(v1=['layers.InputSpec'])
 class InputSpec(object):
-  """Specifies the ndim, dtype and shape of every input to a layer.
+  """Specifies the rank, dtype and shape of every input to a layer.
 
-  Every layer should expose (if appropriate) an `input_spec` attribute:
-  a list of instances of InputSpec (one per input tensor).
+  Layers can expose (if appropriate) an `input_spec` attribute:
+  an instance of `InputSpec`, or a nested structure of `InputSpec` instances
+  (one per input tensor). These objects enable the layer to run input
+  compatibility checks for input structure, input rank, input shape, and
+  input dtype.
 
   A None entry in a shape is compatible with any dimension,
   a None shape is compatible with any shape.
diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py
index 166553a324b..e13ab8f0b92 100644
--- a/tensorflow/python/keras/engine/network.py
+++ b/tensorflow/python/keras/engine/network.py
@@ -158,7 +158,8 @@ class Network(base_layer.Layer):
   # The key of _layer_call_argspecs is a layer. tf.Module._flatten will fail to
   # flatten the key since it is trying to convert Trackable/Layer to a string.
   _TF_MODULE_IGNORED_PROPERTIES = frozenset(itertools.chain(
-      ('_layer_call_argspecs', '_compiled_trainable_state'),
+      ('_layer_call_argspecs', '_compiled_trainable_state',
+       '_output_mask_cache', '_output_tensor_cache', '_output_shape_cache'),
       base_layer.Layer._TF_MODULE_IGNORED_PROPERTIES
   ))
@@ -720,10 +721,17 @@ class Network(base_layer.Layer):
                        ': model has ' + str(len(self._input_layers)) +
                        ' tensor inputs.')
 
-    cache_key = generic_utils.object_list_uid(input_shape)
-    if cache_key in self._output_shape_cache:
-      # Cache hit. Return shapes as TensorShapes.
-      return self._output_shape_cache[cache_key]
+    # Use a tuple of TensorShapes as the cache key, since a tuple is hashable
+    # and can be used as a dict key.
+    try:
+      cache_key = tuple(tf_utils.convert_shapes(input_shape, to_tuples=True))
+      if cache_key in self._output_shape_cache:
+        # Cache hit. Return shapes as TensorShapes.
+        return self._output_shape_cache[cache_key]
+    except ValueError:
+      # If there are unknown TensorShapes (e.g. for a sparse tensor input),
+      # we skip caching, since the shape cannot form a stable key.
+      pass
 
     layers_to_output_shapes = {}
     for layer, shape in zip(self._input_layers, nest.flatten(input_shape)):
@@ -905,9 +913,14 @@ class Network(base_layer.Layer):
 
     if output_shapes is not None:
       input_shapes = [x.shape for x in inputs]
-      cache_key = generic_utils.object_list_uid(input_shapes)
-      self._output_shape_cache[cache_key] = nest.pack_sequence_as(
-          self._nested_outputs, output_shapes)
+      try:
+        cache_key = tuple(tf_utils.convert_shapes(input_shapes, to_tuples=True))
+        self._output_shape_cache[cache_key] = nest.pack_sequence_as(
+            self._nested_outputs, output_shapes)
+      except ValueError:
+        # If there are unknown TensorShapes (e.g. for a sparse tensor input),
+        # we skip caching, since the shape cannot form a stable key.
+        pass
 
     output_tensors = nest.pack_sequence_as(self._nested_outputs, output_tensors)
     return output_tensors
diff --git a/tensorflow/python/keras/engine/network_test.py b/tensorflow/python/keras/engine/network_test.py
index b3e19f2a6ea..17f08889936 100644
--- a/tensorflow/python/keras/engine/network_test.py
+++ b/tensorflow/python/keras/engine/network_test.py
@@ -1869,6 +1869,15 @@ class CacheCorrectnessTest(keras_parameterized.TestCase):
     self.assertEqual(network.dynamic, False)
     self.assertEqual(network.stateful, False)
 
+  def test_compute_output_shape_cache(self):
+    # See https://github.com/tensorflow/tensorflow/issues/32029.
+    x = input_layer_lib.Input(shape=(None, 32))
+    dense = keras.layers.Dense(2)
+    y = dense(x)
+    network = network_lib.Network(x, y, name='dense_network')
+
+    for i in range(999, 1024):
+      self.assertEqual(network.compute_output_shape((1, i, 32)), (1, i, 2))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/engine/sequential.py b/tensorflow/python/keras/engine/sequential.py
index 4ae06bc46e1..447fc22c5f0 100644
--- a/tensorflow/python/keras/engine/sequential.py
+++ b/tensorflow/python/keras/engine/sequential.py
@@ -116,6 +116,7 @@ class Sequential(training.Model):
     super(Sequential, self).__init__(name=name, autocast=False)
     self.supports_masking = True
     self._compute_output_and_mask_jointly = True
+    self._auto_track_sub_layers = False
 
     self._layer_call_argspecs = {}
diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py
index 7e86d9e2d8b..f9ec6f37b45 100644
--- a/tensorflow/python/keras/engine/training.py
+++ b/tensorflow/python/keras/engine/training.py
@@ -42,6 +42,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops.ragged import ragged_concat_ops
 from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.profiler import traceme
 from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import deprecation
 from tensorflow.python.util import nest
@@ -308,20 +309,21 @@ class Model(network.Network, version_utils.ModelVersionSelector):
         `optimizer`, `loss`, `metrics` or `sample_weight_mode`.
""" _keras_api_gauge.get_cell('compile').set(True) - self._validate_compile(optimizer, **kwargs) - self._run_eagerly = kwargs.pop('run_eagerly', None) + with self.distribute_strategy.scope(): + self._validate_compile(optimizer, metrics, **kwargs) + self._run_eagerly = kwargs.pop('run_eagerly', None) - self.optimizer = self._get_optimizer(optimizer) - self.compiled_loss = compile_utils.LossesContainer( - loss, loss_weights, output_names=self.output_names) - self.compiled_metrics = compile_utils.MetricsContainer( - metrics, weighted_metrics, output_names=self.output_names) + self.optimizer = self._get_optimizer(optimizer) + self.compiled_loss = compile_utils.LossesContainer( + loss, loss_weights, output_names=self.output_names) + self.compiled_metrics = compile_utils.MetricsContainer( + metrics, weighted_metrics, output_names=self.output_names) - # Initializes attrs that are reset each time `compile` is called. - self._reset_compile_cache() - self._is_compiled = True + # Initializes attrs that are reset each time `compile` is called. + self._reset_compile_cache() + self._is_compiled = True - self.loss = loss or {} # Backwards compat. + self.loss = loss or {} # Backwards compat. def _get_optimizer(self, optimizer): """Wraps `optimizer` in `LossScaleOptimizer` if necessary.""" @@ -759,9 +761,15 @@ class Model(network.Network, version_utils.ModelVersionSelector): callbacks.on_epoch_begin(epoch) with data_handler.catch_stop_iteration(): for step in data_handler.steps(): - callbacks.on_train_batch_begin(step) - logs = train_function(iterator) - callbacks.on_train_batch_end(step, logs) + with traceme.TraceMe( + 'TraceContext', + graph_type='train', + epoch_num=epoch, + step_num=step, + batch_size=batch_size): + callbacks.on_train_batch_begin(step) + logs = train_function(iterator) + callbacks.on_train_batch_end(step, logs) epoch_logs = {m.name: m.result() for m in self.metrics} # Run validation. @@ -982,15 +990,20 @@ class Model(network.Network, version_utils.ModelVersionSelector): self.reset_metrics() with data_handler.catch_stop_iteration(): for step in data_handler.steps(): - callbacks.on_test_batch_begin(step) - logs = test_function(iterator) - callbacks.on_test_batch_end(step, logs) + with traceme.TraceMe( + 'TraceContext', + graph_type='test', + step_num=step): + callbacks.on_test_batch_begin(step) + logs = test_function(iterator) + callbacks.on_test_batch_end(step, logs) callbacks.on_test_end() + logs = to_numpy(logs) if return_dict: - return {m.name: m.result().numpy() for m in self.metrics} + return logs else: - results = [m.result().numpy() for m in self.metrics] + results = [logs.get(name, None) for name in self.metrics_names] if len(results) == 1: return results[0] return results @@ -1186,7 +1199,8 @@ class Model(network.Network, version_utils.ModelVersionSelector): y=None, sample_weight=None, class_weight=None, - reset_metrics=True): + reset_metrics=True, + return_dict=False): """Runs a single gradient update on a single batch of data. Arguments: @@ -1213,6 +1227,9 @@ class Model(network.Network, version_utils.ModelVersionSelector): reset_metrics: If `True`, the metrics returned will be only for this batch. If `False`, the metrics will be statefully accumulated across batches. + return_dict: If `True`, loss and metric results are returned as a dict, + with each key being the name of the metric. If `False`, they are + returned as a list. 
Returns: Scalar training loss @@ -1232,15 +1249,25 @@ class Model(network.Network, version_utils.ModelVersionSelector): y, sample_weight, class_weight) train_function = self._make_train_function() - train_function(iterator) - metrics = [m.result().numpy() for m in self.metrics] + logs = train_function(iterator) + if reset_metrics: self.reset_metrics() - if len(metrics) == 1: - return metrics[0] - return metrics + logs = to_numpy(logs) + if return_dict: + return logs + else: + results = [logs.get(name, None) for name in self.metrics_names] + if len(results) == 1: + return results[0] + return results - def test_on_batch(self, x, y=None, sample_weight=None, reset_metrics=True): + def test_on_batch(self, + x, + y=None, + sample_weight=None, + reset_metrics=True, + return_dict=False): """Test the model on a single batch of samples. Arguments: @@ -1261,6 +1288,9 @@ class Model(network.Network, version_utils.ModelVersionSelector): reset_metrics: If `True`, the metrics returned will be only for this batch. If `False`, the metrics will be statefully accumulated across batches. + return_dict: If `True`, loss and metric results are returned as a dict, + with each key being the name of the metric. If `False`, they are + returned as a list. Returns: Scalar test loss (if the model has a single output and no metrics) @@ -1277,13 +1307,18 @@ class Model(network.Network, version_utils.ModelVersionSelector): iterator = data_adapter.single_batch_iterator(self.distribute_strategy, x, y, sample_weight) test_function = self._make_test_function() - test_function(iterator) - metrics = [m.result().numpy() for m in self.metrics] + logs = test_function(iterator) + if reset_metrics: self.reset_metrics() - if len(metrics) == 1: - return metrics[0] - return metrics + logs = to_numpy(logs) + if return_dict: + return logs + else: + results = [logs.get(name, None) for name in self.metrics_names] + if len(results) == 1: + return results[0] + return results def predict_on_batch(self, x): """Returns predictions for a single batch of samples. @@ -1420,7 +1455,7 @@ class Model(network.Network, version_utils.ModelVersionSelector): 'and the first argument in `call` as positional arguments, ' 'found: ' + str(extra_args) + '.') - def _validate_compile(self, optimizer, **kwargs): + def _validate_compile(self, optimizer, metrics, **kwargs): """Performs validation checks for the default `compile`.""" if any( isinstance(opt, optimizers.Optimizer) @@ -1460,6 +1495,22 @@ class Model(network.Network, version_utils.ModelVersionSelector): ' model=_create_model()\n' ' model.compile(...)' % (v, strategy)) + # Model metrics must be created in the same distribution strategy scope + # as the model. + strategy = self._get_distribution_strategy() + for metric in nest.flatten(metrics): + for v in getattr(metric, 'variables', []): + if not strategy.extended.variable_created_in_scope(v): + raise ValueError( + 'Metric (%s) passed to model.compile was created inside of a ' + 'different distribution strategy scope than the model. All ' + 'metrics must be created in the same distribution strategy ' + 'scope as the model (in this case %s). If you pass in a string ' + 'identifier for a metric to compile the metric will ' + 'automatically be created in the correct distribution ' + 'strategy scope.' % (metric, strategy) + ) + def _maybe_load_initial_epoch_from_ckpt(self, initial_epoch): """Maybe load initial epoch from ckpt considering possible worker recovery. 
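A short sketch of the `return_dict` flag introduced above for `train_on_batch`/`test_on_batch`; the model and data are made up for illustration:

```python
# return_dict=True keys results by metric name instead of returning a list.
import numpy as np
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(2,))])
model.compile('sgd', loss='mse', metrics=['mae'])

x, y = np.ones((4, 2)), np.ones((4, 1))
as_list = model.train_on_batch(x, y)                    # [loss, mae]
as_dict = model.train_on_batch(x, y, return_dict=True)  # {'loss': ..., 'mae': ...}
```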
diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py index 6ee8971d567..0a1d4e0d920 100644 --- a/tensorflow/python/keras/engine/training_test.py +++ b/tensorflow/python/keras/engine/training_test.py @@ -43,6 +43,7 @@ from tensorflow.python.keras.utils import data_utils from tensorflow.python.keras.utils import np_utils from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import nn_ops from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import sparse_ops from tensorflow.python.ops import state_ops @@ -1255,6 +1256,50 @@ class TrainingTest(keras_parameterized.TestCase): with self.assertRaisesRegex(RuntimeError, 'must compile your model'): model.fit(np.random.random((32, 1)), epochs=2) + @keras_parameterized.run_all_keras_modes + @testing_utils.enable_v2_dtype_behavior + def test_losses_of_different_dtypes(self): + inp = keras.Input(shape=(2,)) + out_1 = keras.layers.Dense(2, dtype='float32', kernel_regularizer='l2')(inp) + out_2 = keras.layers.Dense(2, dtype='float16', kernel_regularizer='l2')(inp) + model = keras.Model(inp, [out_1, out_2]) + extra_loss = math_ops.reduce_sum(math_ops.cast(out_2, 'float64')) + model.add_loss(extra_loss) + model.compile('sgd', ['mse', 'mse'], + run_eagerly=testing_utils.should_run_eagerly()) + x, y = np.ones((10, 2)), np.ones((10, 2)) + model.fit(x, [y, y]) + + @keras_parameterized.run_all_keras_modes + @testing_utils.enable_v2_dtype_behavior + def test_losses_of_different_dtypes_with_subclassed_model(self): + class MyModel(keras.Model): + + def build(self, _): + self.dense = keras.layers.Dense(2) + + def call(self, inputs): + self.add_loss(math_ops.cast(nn_ops.l2_loss(inputs), 'float64')) + return self.dense(inputs) + + model = MyModel(dtype='float32') + model.compile('sgd', 'mse', run_eagerly=testing_utils.should_run_eagerly()) + x, y = np.ones((10, 2)), np.ones((10, 2)) + model.fit(x, y) + + @keras_parameterized.run_all_keras_modes + @testing_utils.enable_v2_dtype_behavior + def test_regularizer_of_different_dtype(self): + inp = keras.Input(shape=(2,)) + def regularizer(weight): + return math_ops.cast(nn_ops.l2_loss(weight), 'float64') + out = keras.layers.Dense(2, dtype='float32', + kernel_regularizer=regularizer)(inp) + model = keras.Model(inp, out) + model.compile('sgd', 'mse', run_eagerly=testing_utils.should_run_eagerly()) + x, y = np.ones((10, 2)), np.ones((10, 2)) + model.fit(x, y) + class TestExceptionsAndWarnings(keras_parameterized.TestCase): diff --git a/tensorflow/python/keras/engine/training_v1.py b/tensorflow/python/keras/engine/training_v1.py index 9261ab30889..1c0fea91337 100644 --- a/tensorflow/python/keras/engine/training_v1.py +++ b/tensorflow/python/keras/engine/training_v1.py @@ -1546,7 +1546,7 @@ class Model(training_lib.Model): if self.run_eagerly: raise TypeError('total loss can not be computed when compiled with ' 'run_eagerly = True.') - total_loss = None + loss_list = [] with K.name_scope('loss'): for endpoint, mask in zip(self._training_endpoints, masks): if endpoint.should_skip_target(): @@ -1605,23 +1605,25 @@ class Model(training_lib.Model): if loss_reduction == losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE: output_loss = losses_utils.scale_loss_for_distribution(output_loss) - if total_loss is None: - total_loss = loss_weight * output_loss - else: - total_loss += loss_weight * output_loss - if total_loss is None: - if not self.losses: - raise ValueError('The model cannot be 
compiled '
-                         'because it has no loss to optimize.')
-      else:
-        total_loss = 0.
+      loss_list.append(loss_weight * output_loss)
+    if not loss_list and not self.losses:
+      raise ValueError('The model cannot be compiled '
+                       'because it has no loss to optimize.')
 
       # Add regularization penalties and other layer-specific losses.
       custom_losses = self.get_losses_for(None) + self.get_losses_for(
           self.inputs)
       if custom_losses:
-        total_loss += losses_utils.scale_loss_for_distribution(
-            math_ops.add_n(custom_losses))
+        total_custom_loss = math_ops.add_n(
+            losses_utils.cast_losses_to_common_dtype(custom_losses))
+        loss_list.append(
+            losses_utils.scale_loss_for_distribution(total_custom_loss))
+
+    loss_list = losses_utils.cast_losses_to_common_dtype(loss_list)
+    if loss_list:
+      total_loss = math_ops.add_n(loss_list)
+    else:
+      total_loss = 0.
     return total_loss
 
   def _get_callback_model(self):
diff --git a/tensorflow/python/keras/layers/BUILD b/tensorflow/python/keras/layers/BUILD
index d579b618e6b..6ead078c199 100644
--- a/tensorflow/python/keras/layers/BUILD
+++ b/tensorflow/python/keras/layers/BUILD
@@ -448,7 +448,7 @@ tf_py_test(
     size = "medium",
    srcs = ["convolutional_recurrent_test.py"],
     python_version = "PY3",
-    shard_count = 4,
+    shard_count = 6,
     deps = [
         "//tensorflow/python:client_testlib",
         "//tensorflow/python/keras",
diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py
index 32ad7a89b77..546e201bda4 100644
--- a/tensorflow/python/keras/layers/core.py
+++ b/tensorflow/python/keras/layers/core.py
@@ -127,9 +127,37 @@ class Masking(Layer):
 class Dropout(Layer):
   """Applies Dropout to the input.
 
-  Dropout consists in randomly setting
-  a fraction `rate` of input units to 0 at each update during training time,
-  which helps prevent overfitting.
+  The Dropout layer randomly sets input units to 0 with a frequency of `rate`
+  at each step during training time, which helps prevent overfitting.
+  Inputs not set to 0 are scaled up by 1/(1 - rate) such that the sum over
+  all inputs is unchanged.
+
+  Note that the Dropout layer only applies when `training` is set to True,
+  so no values are dropped during inference. When using `model.fit`,
+  `training` will be appropriately set to True automatically; in other
+  contexts, you can set the kwarg explicitly to True when calling the layer.
+
+  (This is in contrast to setting `trainable=False` for a Dropout layer.
+  `trainable` does not affect the layer's behavior, as Dropout does
+  not have any variables/weights that can be frozen during training.)
+
+  >>> tf.random.set_seed(0)
+  >>> layer = tf.keras.layers.Dropout(.2, input_shape=(2,))
+  >>> data = np.arange(10).reshape(5, 2).astype(np.float32)
+  >>> print(data)
+  [[0. 1.]
+   [2. 3.]
+   [4. 5.]
+   [6. 7.]
+   [8. 9.]]
+  >>> outputs = layer(data, training=True)
+  >>> print(outputs)
+  tf.Tensor(
+  [[ 0.    1.25]
+   [ 2.5   3.75]
+   [ 5.    6.25]
+   [ 7.5   8.75]
+   [10.    0.  ]], shape=(5, 2), dtype=float32)
 
   Arguments:
     rate: Float between 0 and 1. Fraction of the input units to drop.
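The training-flag behavior described in the reworked `Dropout` docstring, as a small runnable check (illustrative only, not part of the change):

```python
# Dropout is only active when training=True; inference is the identity.
import numpy as np
import tensorflow as tf

layer = tf.keras.layers.Dropout(0.5)
data = np.ones((2, 4), dtype=np.float32)

inference = layer(data, training=False)  # nothing dropped, nothing scaled
training = layer(data, training=True)    # ~half zeroed, survivors scaled by 2

assert np.allclose(inference.numpy(), data)
print(training.numpy())  # zeros mixed with 2.0 entries
```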
diff --git a/tensorflow/python/keras/layers/preprocessing/BUILD b/tensorflow/python/keras/layers/preprocessing/BUILD index 720e92483fb..e0dd9114755 100644 --- a/tensorflow/python/keras/layers/preprocessing/BUILD +++ b/tensorflow/python/keras/layers/preprocessing/BUILD @@ -303,10 +303,9 @@ cuda_py_test( ) tf_py_test( - name = "preprocessing_normalization_test", + name = "normalization_test", size = "small", srcs = ["normalization_test.py"], - main = "normalization_test.py", python_version = "PY3", deps = [ ":normalization", @@ -317,10 +316,9 @@ tf_py_test( ) tf_py_test( - name = "preprocessing_text_vectorization_test", + name = "text_vectorization_test", size = "medium", srcs = ["text_vectorization_test.py"], - main = "text_vectorization_test.py", python_version = "PY3", deps = [ ":preprocessing_test_utils", diff --git a/tensorflow/python/keras/layers/preprocessing/categorical_encoding.py b/tensorflow/python/keras/layers/preprocessing/categorical_encoding.py index 0bd011646f8..a7dc159cdb6 100644 --- a/tensorflow/python/keras/layers/preprocessing/categorical_encoding.py +++ b/tensorflow/python/keras/layers/preprocessing/categorical_encoding.py @@ -292,6 +292,11 @@ class CategoricalEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): raise ValueError("Unknown output mode %s" % self._output_mode) +class _CategoricalEncodingAccumulator( + collections.namedtuple("Accumulator", ["data", "per_doc_count_dict"])): + pass + + class _CategoricalEncodingCombiner(base_preprocessing_layer.Combiner): """Combiner for the CategoricalEncoding preprocessing layer. @@ -307,8 +312,6 @@ class _CategoricalEncodingCombiner(base_preprocessing_layer.Combiner): # These are indices into the accumulator's `data` array. MAX_VALUE_IDX = 0 DOC_ID_IDX = 1 - ACCUMULATOR_CLS = collections.namedtuple("Accumulator", - ["data", "per_doc_count_dict"]) def __init__(self, compute_max_element=True, compute_idf=False): self._compute_idf = compute_idf @@ -452,4 +455,4 @@ class _CategoricalEncodingCombiner(base_preprocessing_layer.Combiner): else: per_doc_count_dict = None data = [0, 0] - return self.ACCUMULATOR_CLS(data, per_doc_count_dict) + return _CategoricalEncodingAccumulator(data, per_doc_count_dict) diff --git a/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py b/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py index 773fb1c8a3d..f78825cd453 100644 --- a/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py +++ b/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py @@ -215,10 +215,8 @@ class RandomCrop(Layer): crop_size = array_ops.stack( [input_shape[0], self.height, self.width, input_shape[3]]) check = control_flow_ops.Assert( - math_ops.reduce_all(input_shape >= crop_size), [ - 'Need value.shape >= size, got input shape', input_shape, - ' but height is ', self.height, ' and weight is ', self.width - ]) + math_ops.reduce_all(input_shape >= crop_size), + [self.height, self.width]) input_shape = control_flow_ops.with_dependencies([check], input_shape) limit = input_shape - crop_size + 1 offset = stateless_random_ops.stateless_random_uniform( diff --git a/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py b/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py index 861e9fa16ab..b1b9914784c 100644 --- a/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py +++ b/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py @@ -44,7 +44,7 @@ class 
ResizingTest(keras_parameterized.TestCase): orig_width = 8 channels = 3 kwargs.update({'height': expected_height, 'width': expected_width}) - with self.cached_session(use_gpu=True): + with tf_test_util.use_gpu(): testing_utils.layer_test( image_preprocessing.Resizing, kwargs=kwargs, @@ -115,7 +115,7 @@ class CenterCropTest(keras_parameterized.TestCase): (num_samples, orig_height, orig_width, channels)).astype(np.float32) expected_output = get_numpy_center_crop( input_images, expected_height, expected_width) - with self.cached_session(use_gpu=True): + with tf_test_util.use_gpu(): testing_utils.layer_test( image_preprocessing.CenterCrop, kwargs=kwargs, @@ -172,7 +172,7 @@ class RandomCropTest(keras_parameterized.TestCase): orig_width = 8 channels = 3 kwargs = {'height': expected_height, 'width': expected_width} - with self.cached_session(use_gpu=True): + with tf_test_util.use_gpu(): testing_utils.layer_test( image_preprocessing.RandomCrop, kwargs=kwargs, @@ -203,7 +203,7 @@ class RandomCropTest(keras_parameterized.TestCase): with test.mock.patch.object( stateless_random_ops, 'stateless_random_uniform', return_value=mock_offset): - with self.cached_session(use_gpu=True): + with tf_test_util.use_gpu(): layer = image_preprocessing.RandomCrop(height, width) inp = np.random.random((12, 5, 8, 3)) actual_output = layer(inp, training=1) @@ -227,7 +227,7 @@ class RandomCropTest(keras_parameterized.TestCase): np.random.seed(1337) height, width = 3, 3 inp = np.random.random((12, 10, 6, 3)) - with self.cached_session(use_gpu=True): + with tf_test_util.use_gpu(): layer = image_preprocessing.RandomCrop(height, width) actual_output = layer(inp, training=0) resized_inp = image_ops.resize_images_v2( @@ -239,7 +239,7 @@ class RandomCropTest(keras_parameterized.TestCase): np.random.seed(1337) height, width = 4, 6 inp = np.random.random((12, 8, 16, 3)) - with self.cached_session(use_gpu=True): + with tf_test_util.use_gpu(): layer = image_preprocessing.RandomCrop(height, width) actual_output = layer(inp, training=0) resized_inp = image_ops.resize_images_v2( diff --git a/tensorflow/python/keras/layers/preprocessing/index_lookup.py b/tensorflow/python/keras/layers/preprocessing/index_lookup.py index 7bd7f6683d1..364b2e3fe25 100644 --- a/tensorflow/python/keras/layers/preprocessing/index_lookup.py +++ b/tensorflow/python/keras/layers/preprocessing/index_lookup.py @@ -30,8 +30,11 @@ from tensorflow.python.framework import tensor_spec from tensorflow.python.keras.engine import base_preprocessing_layer from tensorflow.python.ops import array_ops from tensorflow.python.ops import lookup_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import string_ops from tensorflow.python.ops.ragged import ragged_functional_ops from tensorflow.python.ops.ragged import ragged_tensor +from tensorflow.python.platform import gfile from tensorflow.python.util import compat # The string tokens in the extracted vocabulary @@ -64,9 +67,17 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): vocabulary is `(max_tokens - num_oov_tokens)` when this value is set. num_oov_tokens: The number of out-of-vocabulary tokens to use; defaults to 1. If this value is more than 1, OOV inputs are hashed to determine their - OOV value; if this value is 0, passing an OOV input will result in a - runtime error. - vocabulary: An optional list of vocabulary terms. + OOV value; if this value is 0, passing an OOV input will result in a '-1' + being returned for that value in the output tensor. 
(Note that, because
+      the value is -1 and not 0, this will allow you to effectively drop OOV
+      values from categorical encodings.)
+    vocabulary: An optional list of vocabulary terms, or a path to a text file
+      containing a vocabulary to load into this layer. The file should contain
+      one token per line. In either case, the vocabulary must be unique; if
+      the list or file contains the same token multiple times, an error will
+      be thrown. Note that when passing a vocabulary - either as a list or as
+      a file - the vocabulary will not be present in the layer's config dict;
+      it will instead be a part of the layer's weights.
     reserve_zero: Whether to reserve the index 0, which indicates pad values in
       the Keras masking system. If True, the output of this layer will be in the
       range `[1...max_tokens+1)`; if False, the output will be in the range
       `[0...max_tokens)`.
@@ -103,10 +114,9 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer):
       raise ValueError("max_tokens must be greater than 1.")
 
     # For now, limit the num_oov_tokens to one.
-    if num_oov_tokens != 1:
-      raise ValueError("num_oov_tokens must be 1 for the time being. Other "
-                       "values will be supported in the near future. "
-                       "You passed %s" % num_oov_tokens)
+    if num_oov_tokens < 0:
+      raise ValueError("num_oov_tokens must be greater than or equal to 0. "
+                       "You passed %s" % num_oov_tokens)
 
     self.max_tokens = max_tokens
     self.num_oov_tokens = num_oov_tokens
@@ -164,10 +174,38 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer):
       self._inverse_table = None
 
     if vocabulary is not None:
-      self._export_vocab = True
+      if isinstance(vocabulary, str):
+        vocabulary = self._get_vocabulary_from_file(vocabulary)
+
+      vocabulary_set = set(vocabulary)
+      if len(vocabulary) != len(vocabulary_set):
+        repeated_items = [
+            item for item, count in collections.Counter(vocabulary).items()
+            if count > 1
+        ]
+        raise ValueError("The passed vocabulary has at least one repeated "
+                         "term. Please uniquify your dataset before passing "
+                         "it to IndexLookup(). The repeated terms are %s" %
+                         repeated_items)
       self.set_vocabulary(vocabulary)
-    else:
-      self._export_vocab = False
+
+  def _get_vocabulary_from_file(self, vocabulary_path):
+    vocab = []
+    with gfile.GFile(vocabulary_path, "r") as reader:
+      while True:
+        # Get the next line, and break if it is None.
+        text = reader.readline()
+        if not text:
+          break
+
+        # Convert raw bytes to UTF-8 text if needed, and strip whitespace.
+ if isinstance(text, str): + token = text + elif isinstance(text, bytes): + token = text.decode("utf-8", "ignore") + token = token.strip() + vocab.append(token) + return vocab def _get_table_data(self): keys, values = self._table.export() @@ -256,11 +294,10 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): return [x for _, x in sorted(zip(values, keys))] def get_config(self): - vocabulary = self.get_vocabulary() if self._export_vocab else None config = { "max_tokens": self.max_tokens, "num_oov_tokens": self.num_oov_tokens, - "vocabulary": vocabulary, + "vocabulary": None, "reserve_zero": self.reserve_zero, "mask_zero": self.mask_zero, } @@ -351,19 +388,38 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): return super(IndexLookup, self).__call__(inputs, invert=invert, **kwargs) + def replace_oov_buckets(self, inputs, lookups): + if self.num_oov_tokens <= 1: + return lookups + + if inputs.dtype.is_integer: + inputs = string_ops.as_string(inputs) + hashed_inputs = string_ops.string_to_hash_bucket_fast( + inputs, num_buckets=self.num_oov_tokens) + if self.reserve_zero: + hashed_inputs = math_ops.add(hashed_inputs, 1) + return array_ops.where(math_ops.equal(lookups, -1), hashed_inputs, lookups) + def call(self, inputs, invert=False): table = self._inverse_table if invert else self._table # The table lookup ops don't natively support ragged tensors, so if we have # a RT we need to use map_flat_values to look up every element. if ragged_tensor.is_ragged(inputs): indexed_data = ragged_functional_ops.map_flat_values(table.lookup, inputs) + if not invert: + indexed_data = ragged_functional_ops.map_flat_values( + self.replace_oov_buckets, inputs, indexed_data) elif isinstance( inputs, (sparse_tensor.SparseTensor, sparse_tensor.SparseTensorValue)): - indexed_data = sparse_tensor.SparseTensor(inputs.indices, - table.lookup(inputs.values), + if not invert: + values = self.replace_oov_buckets(inputs.values, + table.lookup(inputs.values)) + indexed_data = sparse_tensor.SparseTensor(inputs.indices, values, inputs.dense_shape) else: indexed_data = table.lookup(inputs) + if not invert: + indexed_data = self.replace_oov_buckets(inputs, indexed_data) # (b/149446477): output does not preserve input shape. indexed_data.set_shape(inputs.shape) @@ -373,6 +429,11 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): return array_ops.identity(indexed_data) +class _IndexLookupAccumulator( + collections.namedtuple("Accumulator", ["count_dict"])): + pass + + class _IndexLookupCombiner(base_preprocessing_layer.Combiner): """Combiner for the IndexLookup preprocessing layer. 
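The multi-bucket OOV scheme implemented by `replace_oov_buckets` above, reduced to plain TF ops; the vocabulary ids and inputs here are made up for illustration:

```python
# OOV tokens hash into one of num_oov_tokens buckets; when index 0 is
# reserved for masking (reserve_zero=True), hashed ids are shifted up by one.
import tensorflow as tf

num_oov_tokens = 2
tokens = tf.constant(["fire", "ohio"])          # "ohio" is out-of-vocabulary
lookups = tf.constant([6, -1], dtype=tf.int64)  # -1 marks a table miss

hashed = tf.strings.to_hash_bucket_fast(tokens, num_buckets=num_oov_tokens)
hashed += 1  # shift past the reserved mask index
result = tf.where(tf.equal(lookups, -1), hashed, lookups)
print(result.numpy())  # e.g. [6 1] or [6 2], depending on the hash
```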
@@ -385,7 +446,6 @@ class _IndexLookupCombiner(base_preprocessing_layer.Combiner):
     set to a value greater than the total number of distinct tokens in the
     dataset, all tokens are retained.
   """
-  ACCUMULATOR_CLS = collections.namedtuple("Accumulator", ["count_dict"])
 
   def __init__(self, vocab_size=None):
     self._vocab_size = vocab_size
@@ -461,4 +521,4 @@ class _IndexLookupCombiner(base_preprocessing_layer.Combiner):
     """Accumulate a sorted array of vocab tokens and corresponding counts."""
     count_dict = collections.defaultdict(int)
-    return self.ACCUMULATOR_CLS(count_dict)
+    return _IndexLookupAccumulator(count_dict)
diff --git a/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py b/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py
index fbb6062ce0b..de8d5623f5e 100644
--- a/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py
+++ b/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py
@@ -38,6 +38,7 @@ from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils
 from tensorflow.python.keras.saving import save
 from tensorflow.python.keras.utils.generic_utils import CustomObjectScope
 from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 
 
@@ -224,6 +225,84 @@ class CategoricalEncodingInputTest(
     self.assertAllEqual(expected_output, output_dataset)
 
 
+@keras_parameterized.run_all_keras_modes
+class CategoricalEncodingMultiOOVTest(
+    keras_parameterized.TestCase,
+    preprocessing_test_utils.PreprocessingLayerTest):
+
+  def test_sparse_string_input_multi_bucket(self):
+    vocab_data = ["earth", "wind", "and", "fire"]
+    input_array = sparse_tensor.SparseTensor(
+        indices=[[0, 0], [1, 2]],
+        values=["fire", "ohio"],
+        dense_shape=[3, 4])
+
+    expected_indices = [[0, 0], [1, 2]]
+    expected_values = [6, 2]
+    expected_dense_shape = [3, 4]
+
+    input_data = keras.Input(shape=(None,), dtype=dtypes.string, sparse=True)
+    layer = get_layer_class()(max_tokens=None, num_oov_tokens=2)
+    layer.set_vocabulary(vocab_data)
+    int_data = layer(input_data)
+    model = keras.Model(inputs=input_data, outputs=int_data)
+    output_data = model.predict(input_array, steps=1)
+    self.assertAllEqual(expected_indices, output_data.indices)
+    self.assertAllEqual(expected_values, output_data.values)
+    self.assertAllEqual(expected_dense_shape, output_data.dense_shape)
+
+  def test_sparse_int_input_multi_bucket(self):
+    vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
+    input_array = sparse_tensor.SparseTensor(
+        indices=[[0, 0], [1, 2]],
+        values=np.array([13, 132], dtype=np.int64),
+        dense_shape=[3, 4])
+
+    expected_indices = [[0, 0], [1, 2]]
+    expected_values = [6, 2]
+    expected_dense_shape = [3, 4]
+
+    input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True)
+    layer = get_layer_class()(
+        max_tokens=None, dtype=dtypes.int64, num_oov_tokens=2)
+    layer.set_vocabulary(vocab_data)
+    int_data = layer(input_data)
+    model = keras.Model(inputs=input_data, outputs=int_data)
+    output_data = model.predict(input_array, steps=1)
+    self.assertAllEqual(expected_indices, output_data.indices)
+    self.assertAllEqual(expected_values, output_data.values)
+    self.assertAllEqual(expected_dense_shape, output_data.dense_shape)
+
+  def test_ragged_string_input_multi_bucket(self):
+    vocab_data = ["earth", "wind", "and", "fire"]
+    input_array = ragged_factory_ops.constant(
+        [["earth", "wind", "fire"], ["fire", "and", "earth", "ohio"]])
+    expected_output = [[3, 4, 6], [6, 5, 3,
2]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.string, ragged=True) + layer = get_layer_class()(max_tokens=None, num_oov_tokens=2) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_ragged_int_input_multi_bucket(self): + vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) + input_array = ragged_factory_ops.constant([[10, 11, 13], [13, 12, 10, 132]], + dtype=np.int64) + expected_output = [[3, 4, 6], [6, 5, 3, 2]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True) + layer = get_layer_class()( + max_tokens=None, dtype=dtypes.int64, num_oov_tokens=2) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + @keras_parameterized.run_all_keras_modes class CategoricalEncodingAdaptTest( keras_parameterized.TestCase, @@ -356,7 +435,22 @@ class IndexLookupOutputTest(keras_parameterized.TestCase, output_dataset = model.predict(input_array) self.assertAllEqual(expected_output, output_dataset) - def test_int_output_explicit_vocab_from_config(self): + +@keras_parameterized.run_all_keras_modes +class IndexLookupVocabularyTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest + ): + + def _write_to_temp_file(self, file_name, vocab_list): + vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt") + with gfile.GFile(vocab_path, "w") as writer: + for vocab in vocab_list: + writer.write(vocab + "\n") + writer.flush() + writer.close() + return vocab_path + + def test_int_output_explicit_vocab(self): vocab_data = ["earth", "wind", "and", "fire"] input_array = np.array([["earth", "wind", "and", "fire"], ["fire", "and", "earth", "michigan"]]) @@ -366,10 +460,22 @@ class IndexLookupOutputTest(keras_parameterized.TestCase, layer = get_layer_class()(vocabulary=vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) - with CustomObjectScope({"IndexLookup": get_layer_class()}): - new_model = keras.Model.from_config(model.get_config()) - output_dataset = new_model.predict(input_array) + def test_int_output_explicit_vocab_from_file(self): + vocab_list = ["earth", "wind", "and", "fire"] + vocab_path = self._write_to_temp_file("vocab_file", vocab_list) + + input_array = np.array([["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"]]) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.string) + layer = get_layer_class()(vocabulary=vocab_path) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) self.assertAllEqual(expected_output, output_dataset) def test_vocab_appending(self): @@ -387,6 +493,17 @@ class IndexLookupOutputTest(keras_parameterized.TestCase, output_dataset = model.predict(input_array) self.assertAllClose(expected_output, output_dataset) + def test_non_unique_vocab_fails(self): + vocab_data = ["earth", "wind", "and", "fire", "fire"] + with self.assertRaisesRegex(ValueError, ".*repeated term.*fire.*"): + _ = get_layer_class()(vocabulary=vocab_data) + + def 
test_non_unique_vocab_from_file_fails(self): + vocab_list = ["earth", "wind", "and", "fire", "earth"] + vocab_path = self._write_to_temp_file("repeat_vocab_file", vocab_list) + with self.assertRaisesRegex(ValueError, ".*repeated term.*earth.*"): + _ = get_layer_class()(vocabulary=vocab_path) + @keras_parameterized.run_all_keras_modes class InverseLookupOutputTest(keras_parameterized.TestCase, diff --git a/tensorflow/python/keras/layers/preprocessing/normalization.py b/tensorflow/python/keras/layers/preprocessing/normalization.py index 150dd43d676..2b6b8a5b65a 100644 --- a/tensorflow/python/keras/layers/preprocessing/normalization.py +++ b/tensorflow/python/keras/layers/preprocessing/normalization.py @@ -63,9 +63,7 @@ class Normalization(CombinerPreprocessingLayer): dtype = dtype or K.floatx() super(Normalization, self).__init__( - combiner=Normalization._NormalizingCombiner(axis), - dtype=dtype, - **kwargs) + combiner=_NormalizingCombiner(axis), dtype=dtype, **kwargs) if axis == 0: raise ValueError('The argument \'axis\' may not be 0.') @@ -129,112 +127,116 @@ class Normalization(CombinerPreprocessingLayer): weights.append(np.array(0)) super(Normalization, self).set_weights(weights) - class _NormalizingCombiner(Combiner): - """Combiner for the Normalization preprocessing layer. - This class encapsulates the computations for finding the mean and variance - of a set of data in a stable and numerically correct way. Its associated - accumulator is a namedtuple('count', 'mean', 'variance'). +class _NormalizingAccumulator( + collections.namedtuple('_NormalizingAccumulator', + ['count', 'mean', 'variance'])): + pass - Attributes: - axis: The axis to compute mean and var over. - """ - def __init__(self, axis): - self.axis = axis +class _NormalizingCombiner(Combiner): + """Combiner for the Normalization preprocessing layer. - def compute(self, values, accumulator=None): - """Compute a step in this computation, returning a new accumulator.""" + This class encapsulates the computations for finding the mean and variance + of a set of data in a stable and numerically correct way. Its associated + accumulator is a namedtuple('count', 'mean', 'variance'). - # This is the shape of all reduced axes (not specified in 'axis'). - reduction_counts = np.delete(values.shape, self.axis) - # We get the number of elements that will be reduced by multiplying all - # values of 'shape' corresponding to the reduced axes. - count = np.prod(reduction_counts, dtype=np.int32) + Attributes: + axis: The axis to compute mean and var over. + """ - # We want to reduce across dimensions except those specified in 'axis' - # when using np.mean or np.variance; create the tuple of axes to reduce - # over here. - reduction_axes = tuple(np.delete(range(values.ndim), self.axis)) - mean = np.mean(values, axis=reduction_axes, dtype=np.float64) - variance = np.var(values, axis=reduction_axes, dtype=np.float64) + def __init__(self, axis): + self.axis = axis - # Create an accumulator with our new data and either return it or combine - # it with the passed accumulator. - sanitized_accumulator = self._create_accumulator(count, mean, variance) - if accumulator is None: - return sanitized_accumulator - else: - return self.merge([accumulator, sanitized_accumulator]) + def compute(self, values, accumulator=None): + """Compute a step in this computation, returning a new accumulator.""" - def merge(self, accumulators): - """Merge several accumulators to a single accumulator.""" - # Combine accumulators and return the result. 
- combined_count = np.sum( - [accumulator.count for accumulator in accumulators]) + # This is the shape of all reduced axes (not specified in 'axis'). + reduction_counts = np.delete(values.shape, self.axis) + # We get the number of elements that will be reduced by multiplying all + # values of 'shape' corresponding to the reduced axes. + count = np.prod(reduction_counts, dtype=np.int32) - # To combine accumulator means, we weight each accumulator's mean by the - # number of elements that were accumulated, and then divide by the - # total number of elements. - combined_mean = np.add.reduce([ - accumulator.mean * accumulator.count for accumulator in accumulators - ]) / combined_count + # We want to reduce across dimensions except those specified in 'axis' + # when using np.mean or np.variance; create the tuple of axes to reduce + # over here. + reduction_axes = tuple(np.delete(range(values.ndim), self.axis)) + mean = np.mean(values, axis=reduction_axes, dtype=np.float64) + variance = np.var(values, axis=reduction_axes, dtype=np.float64) - # The variance is computed using the lack-of-fit sum of squares - # formula (see https://en.wikipedia.org/wiki/Lack-of-fit_sum_of_squares). - def variance_contribution(accumulator): - return accumulator.count * ( - accumulator.variance + np.square(accumulator.mean - combined_mean)) + # Create an accumulator with our new data and either return it or combine + # it with the passed accumulator. + sanitized_accumulator = self._create_accumulator(count, mean, variance) + if accumulator is None: + return sanitized_accumulator + else: + return self.merge([accumulator, sanitized_accumulator]) - combined_variance = np.add.reduce([ - variance_contribution(accumulator) for accumulator in accumulators - ]) / combined_count + def merge(self, accumulators): + """Merge several accumulators to a single accumulator.""" + # Combine accumulators and return the result. + combined_count = np.sum([accumulator.count for accumulator in accumulators]) - return self._create_accumulator(combined_count, combined_mean, - combined_variance) + # To combine accumulator means, we weight each accumulator's mean by the + # number of elements that were accumulated, and then divide by the + # total number of elements. + combined_mean = np.add.reduce([ + accumulator.mean * accumulator.count for accumulator in accumulators + ]) / combined_count - def extract(self, accumulator): - """Convert an accumulator into a dict of output values.""" - return { - _COUNT_NAME: accumulator.count, - _MEAN_NAME: accumulator.mean, - _VARIANCE_NAME: accumulator.variance - } + # The variance is computed using the lack-of-fit sum of squares + # formula (see https://en.wikipedia.org/wiki/Lack-of-fit_sum_of_squares). + def variance_contribution(accumulator): + return accumulator.count * ( + accumulator.variance + np.square(accumulator.mean - combined_mean)) - def restore(self, output): - """Create an accumulator based on 'output'.""" - # There is no special internal state here, so we just return the relevant - # internal value. - count = output[_COUNT_NAME] - mean = output[_MEAN_NAME] - var = output[_VARIANCE_NAME] - if (count == 0 and (mean.any() != 0.0 or var.any() != 0.0)): - raise RuntimeError( - 'The mean and/or variance of a Normalization preprocessing layer ' - "were set without also setting 'count'. 
If 'count' is not also set," - " 'adapt' cannot be called unless the 'reset_state' arg is True.") - return self._create_accumulator(output[_COUNT_NAME], output[_MEAN_NAME], - output[_VARIANCE_NAME]) + combined_variance = np.add.reduce([ + variance_contribution(accumulator) for accumulator in accumulators + ]) / combined_count - def serialize(self, accumulator): - """Serialize an accumulator for a remote call.""" - output_dict = { - _COUNT_NAME: accumulator.count.tolist(), - _MEAN_NAME: accumulator.mean.tolist(), - _VARIANCE_NAME: accumulator.variance.tolist() - } - return compat.as_bytes(json.dumps(output_dict)) + return self._create_accumulator(combined_count, combined_mean, + combined_variance) - def deserialize(self, encoded_accumulator): - """Deserialize an accumulator received from 'serialize()'.""" - value_dict = json.loads(compat.as_text(encoded_accumulator)) - return self._create_accumulator( - np.array(value_dict[_COUNT_NAME]), np.array(value_dict[_MEAN_NAME]), - np.array(value_dict[_VARIANCE_NAME])) + def extract(self, accumulator): + """Convert an accumulator into a dict of output values.""" + return { + _COUNT_NAME: accumulator.count, + _MEAN_NAME: accumulator.mean, + _VARIANCE_NAME: accumulator.variance + } - def _create_accumulator(self, count, mean, variance): - """Convert any 'nan' values in the given accumulator to numeric values.""" - return collections.namedtuple( - 'Accumulator', ['count', 'mean', 'variance'])(np.array(count), - np.nan_to_num(mean), - np.nan_to_num(variance)) + def restore(self, output): + """Create an accumulator based on 'output'.""" + # There is no special internal state here, so we just return the relevant + # internal value. + count = output[_COUNT_NAME] + mean = output[_MEAN_NAME] + var = output[_VARIANCE_NAME] + if (count == 0 and (mean.any() != 0.0 or var.any() != 0.0)): + raise RuntimeError( + 'The mean and/or variance of a Normalization preprocessing layer ' + "were set without also setting 'count'. 
If 'count' is not also set," + " 'adapt' cannot be called unless the 'reset_state' arg is True.") + return self._create_accumulator(output[_COUNT_NAME], output[_MEAN_NAME], + output[_VARIANCE_NAME]) + + def serialize(self, accumulator): + """Serialize an accumulator for a remote call.""" + output_dict = { + _COUNT_NAME: accumulator.count.tolist(), + _MEAN_NAME: accumulator.mean.tolist(), + _VARIANCE_NAME: accumulator.variance.tolist() + } + return compat.as_bytes(json.dumps(output_dict)) + + def deserialize(self, encoded_accumulator): + """Deserialize an accumulator received from 'serialize()'.""" + value_dict = json.loads(compat.as_text(encoded_accumulator)) + return self._create_accumulator( + np.array(value_dict[_COUNT_NAME]), np.array(value_dict[_MEAN_NAME]), + np.array(value_dict[_VARIANCE_NAME])) + + def _create_accumulator(self, count, mean, variance): + """Convert any 'nan' values in the given accumulator to numeric values.""" + return _NormalizingAccumulator( + np.array(count), np.nan_to_num(mean), np.nan_to_num(variance)) diff --git a/tensorflow/python/keras/layers/preprocessing/normalization_test.py b/tensorflow/python/keras/layers/preprocessing/normalization_test.py index 227e961751e..1b6eb7ae5e8 100644 --- a/tensorflow/python/keras/layers/preprocessing/normalization_test.py +++ b/tensorflow/python/keras/layers/preprocessing/normalization_test.py @@ -113,7 +113,7 @@ class NormalizationTest(keras_parameterized.TestCase, def test_combiner_api_compatibility(self): data = np.array([[1], [2], [3], [4], [5]]) - combiner = normalization.Normalization._NormalizingCombiner(axis=-1) + combiner = normalization._NormalizingCombiner(axis=-1) expected = { "count": np.array(5.0), "variance": np.array([2.]), @@ -180,7 +180,7 @@ class NormalizationTest(keras_parameterized.TestCase, "3d_multi_element_internal_axis" }) def test_combiner_computation_multi_value_axis(self, data, axis, expected): - combiner = normalization.Normalization._NormalizingCombiner(axis=axis) + combiner = normalization._NormalizingCombiner(axis=axis) expected_accumulator = combiner._create_accumulator(**expected) self.validate_accumulator_computation(combiner, data, expected_accumulator) diff --git a/tensorflow/python/keras/layers/preprocessing/preprocessing_stage_test.py b/tensorflow/python/keras/layers/preprocessing/preprocessing_stage_test.py index 1a07b8fe703..1cc48be1e3d 100644 --- a/tensorflow/python/keras/layers/preprocessing/preprocessing_stage_test.py +++ b/tensorflow/python/keras/layers/preprocessing/preprocessing_stage_test.py @@ -64,8 +64,8 @@ class PreprocessingStageTest( self.assertEqual(stage.layers[0].adapt_count, 1) self.assertEqual(stage.layers[1].adapt_count, 1) self.assertEqual(stage.layers[2].adapt_count, 1) - self.assertLess(stage.layers[0].adapt_time, stage.layers[1].adapt_time) - self.assertLess(stage.layers[1].adapt_time, stage.layers[2].adapt_time) + self.assertLessEqual(stage.layers[0].adapt_time, stage.layers[1].adapt_time) + self.assertLessEqual(stage.layers[1].adapt_time, stage.layers[2].adapt_time) # Check call y = stage(array_ops.ones((3, 4))) diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization.py index da5fb687eb2..4e8edf5cc98 100644 --- a/tensorflow/python/keras/layers/preprocessing/text_vectorization.py +++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization.py @@ -633,6 +633,12 @@ class TextVectorization(CombinerPreprocessingLayer): return self._vectorize_layer(indexed_data) +class 
_TextVectorizationAccumulator( + collections.namedtuple("_TextVectorizationAccumulator", + ["count_dict", "per_doc_count_dict", "metadata"])): + pass + + # A note on this combiner: This contains functionality that will be extracted # into the Vectorization and IndexLookup combiner objects. At that point, # TextVectorization can become a PreprocessingStage instead of a Layer and @@ -797,8 +803,6 @@ class _TextVectorizationCombiner(Combiner): def _create_accumulator(self): """Accumulate a sorted array of vocab tokens and corresponding counts.""" - accumulator = collections.namedtuple( - "Accumulator", ["count_dict", "per_doc_count_dict", "metadata"]) count_dict = collections.defaultdict(int) if self._compute_idf: @@ -807,4 +811,5 @@ class _TextVectorizationCombiner(Combiner): else: per_doc_count_dict = None metadata = [0] - return accumulator(count_dict, per_doc_count_dict, metadata) + return _TextVectorizationAccumulator(count_dict, per_doc_count_dict, + metadata) diff --git a/tensorflow/python/keras/mixed_precision/experimental/keras_test.py b/tensorflow/python/keras/mixed_precision/experimental/keras_test.py index 8ec8d914cf5..60396b05a17 100644 --- a/tensorflow/python/keras/mixed_precision/experimental/keras_test.py +++ b/tensorflow/python/keras/mixed_precision/experimental/keras_test.py @@ -525,10 +525,8 @@ class KerasModelTest(keras_parameterized.TestCase): input_shape=(1,)) if use_input_spec: layer.input_spec = input_spec.InputSpec(shape=(2, 1)) - cast_f32_layer = layers.Lambda(lambda x: math_ops.cast(x, 'float32')) - model = testing_utils.get_model_from_layers( - [layer, cast_f32_layer], input_shape=(1,), - input_dtype=dtypes.float16) + model = testing_utils.get_model_from_layers([layer], input_shape=(1,), + input_dtype=dtypes.float16) if get_config: config = model.get_config() model = model.__class__.from_config( @@ -712,12 +710,10 @@ class KerasModelTest(keras_parameterized.TestCase): expected_dtype=dtypes.float16, expected_gradient=[expected_gradient])) y = core.Lambda(identity_with_grad_check_fn)(y) - y = math_ops.cast(y, dtypes.float32) model = models.Model(inputs=x, outputs=y) def loss_fn(y_true, y_pred): - self.assertEqual(y_true.dtype, dtypes.float32) - self.assertEqual(y_pred.dtype, dtypes.float32) + del y_true return math_ops.reduce_mean(y_pred) opt = gradient_descent.SGD(learning_rate) @@ -804,7 +800,6 @@ class KerasModelTest(keras_parameterized.TestCase): expected_dtype=dtypes.float16, expected_gradient=expected_gradient)) y = core.Lambda(identity_with_grad_check_fn)(y) - y = math_ops.cast(y, dtypes.float32) model = models.Model(inputs=x, outputs=y) if get_config: config = model.get_config() @@ -914,7 +909,6 @@ class KerasModelTest(keras_parameterized.TestCase): x = layers.Input(shape=(1,), batch_size=2) layer = mp_test_util.MultiplyLayer(assert_type=dtypes.float16) y = layer(x) - y = math_ops.cast(y, dtypes.float32) model = models.Model(inputs=x, outputs=y) model.set_weights([np.array(100.)]) @@ -960,7 +954,6 @@ class KerasModelTest(keras_parameterized.TestCase): layer = mp_test_util.MultiplyLayer(assert_type=dtypes.float16, var_name=var_name) y = layer(x) - y = math_ops.cast(y, dtypes.float32) model = models.Model(inputs=x, outputs=y) opt = gradient_descent.SGD(1., 1.) 
model.compile( diff --git a/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer.py b/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer.py index e2efd717d9e..15aa41646f2 100644 --- a/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer.py +++ b/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer.py @@ -221,14 +221,17 @@ class LossScaleOptimizer(optimizer_v2.OptimizerV2): grads = self._optimizer.get_gradients(loss, params) return self.get_unscaled_gradients(grads) - def apply_gradients(self, grads_and_vars, name=None): + def apply_gradients(self, grads_and_vars, name=None, + all_reduce_sum_gradients=True): if distribution_strategy_context.in_cross_replica_context(): raise ValueError('apply_gradients() must be called in a replica context.') grads_and_vars = tuple(grads_and_vars) return distribution_strategy_context.get_replica_context().merge_call( - self._apply_gradients_cross_replica, args=(grads_and_vars, name)) + self._apply_gradients_cross_replica, + args=(grads_and_vars, name, all_reduce_sum_gradients)) - def _apply_gradients_cross_replica(self, distribution, grads_and_vars, name): + def _apply_gradients_cross_replica(self, distribution, grads_and_vars, name, + all_reduce_sum_gradients): grads = [g for g, _ in grads_and_vars] loss_scale_update_op, should_apply_grads = self._loss_scale.update(grads) @@ -240,7 +243,8 @@ class LossScaleOptimizer(optimizer_v2.OptimizerV2): # MirroredVariables. wrapped_vars = _UnwrapPreventer([v for _, v in grads_and_vars]) return distribution.extended.call_for_each_replica( - self._apply_gradients, args=(grads, wrapped_vars, name)) + self._apply_gradients, args=(grads, wrapped_vars, name, + all_reduce_sum_gradients)) # Note: We must call this cond() in a cross-replica context. 
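For context on the new `all_reduce_sum_gradients` plumbing, the replica-side calling convention that `apply_gradients` expects looks roughly like this (a hedged sketch against the experimental TF 2.x APIs of this era; the toy model, shapes, and learning rate are placeholders):

```python
import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
  model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
  opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
      tf.keras.optimizers.SGD(0.1), loss_scale='dynamic')

def step(x, y):
  with tf.GradientTape() as tape:
    loss = tf.reduce_mean((model(x) - y) ** 2)
    scaled_loss = opt.get_scaled_loss(loss)
  scaled_grads = tape.gradient(scaled_loss, model.trainable_variables)
  grads = opt.get_unscaled_gradients(scaled_grads)
  # Must run in a replica context: apply_gradients() merge_calls into the
  # cross-replica loss-scale update shown in this hunk.
  opt.apply_gradients(zip(grads, model.trainable_variables))

strategy.experimental_run_v2(step, args=(tf.ones([2, 4]), tf.ones([2, 1])))
```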
  # DistributionStrategy does not support having a cond in a replica context
@@ -251,9 +255,10 @@ class LossScaleOptimizer(optimizer_v2.OptimizerV2):
         control_flow_ops.no_op)
     return control_flow_ops.group(maybe_apply_op, loss_scale_update_op)

-  def _apply_gradients(self, grads, wrapped_vars, name):
+  def _apply_gradients(self, grads, wrapped_vars, name,
+                       all_reduce_sum_gradients):
     return self._optimizer.apply_gradients(list(zip(grads, wrapped_vars.value)),
-                                           name)
+                                           name, all_reduce_sum_gradients)

   def get_config(self):
     serialized_optimizer = optimizers.serialize(self._optimizer)
diff --git a/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer_test.py b/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer_test.py
index 98e7e0bcaa1..8022daf1218 100644
--- a/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer_test.py
+++ b/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer_test.py
@@ -369,10 +369,13 @@ class LossScaleOptimizerTest(test.TestCase, parameterized.TestCase):

     class MyOptimizer(gradient_descent.SGD):

-      def apply_gradients(self, grads_and_vars, name=None):
+      def apply_gradients(self, grads_and_vars, name=None,
+                          all_reduce_sum_gradients=True):
         for grad, _ in grads_and_vars:
           outer_self.assertIsInstance(grad, ops.Tensor)
-        return super(MyOptimizer, self).apply_gradients(grads_and_vars, name)
+        return super(MyOptimizer,
+                     self).apply_gradients(grads_and_vars, name,
+                                           all_reduce_sum_gradients)

     with create_mirrored_strategy().scope() as strategy:
       var = variables.Variable([5.0])
diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
index 6b73963530f..fadc0a225eb 100644
--- a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
+++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
@@ -445,8 +445,8 @@ class OptimizerV2(trackable.Trackable):

     Args:
       grads_and_vars: List of (gradient, variable) pairs.
-      name: Optional name for the returned operation. Default to the name
-        passed to the `Optimizer` constructor.
+      name: Optional name for the returned operation. Defaults to the name
+        passed to the `Optimizer` constructor.
       all_reduce_sum_gradients: Whether to sum gradients from different
         replicas in the presence of `tf.distribute.Strategy`. If False, it is
         the user's responsibility to aggregate the gradients. Defaults to True.
@@ -473,6 +473,13 @@ class OptimizerV2(trackable.Trackable):
       # Distribution strategy does not support reducing an empty list of
       # gradients
       return control_flow_ops.no_op()
+
+    if distribute_ctx.in_cross_replica_context():
+      raise RuntimeError(
+          "`apply_gradients()` cannot be called in cross-replica context. 
" + "Use `tf.distribute.Strategy.experimental_run_v2` to enter replica " + "context.") + apply_state = self._prepare(var_list) return distribute_ctx.get_replica_context().merge_call( functools.partial(self._distributed_apply, apply_state=apply_state), diff --git a/tensorflow/python/keras/preprocessing/BUILD b/tensorflow/python/keras/preprocessing/BUILD index ff78af29f74..640e47a1d44 100644 --- a/tensorflow/python/keras/preprocessing/BUILD +++ b/tensorflow/python/keras/preprocessing/BUILD @@ -26,10 +26,12 @@ py_library( name = "image", srcs = [ "image.py", + "image_pipeline.py", ], deps = [ "//tensorflow/python:util", "//tensorflow/python/keras:backend", + "//tensorflow/python/keras/layers/preprocessing:image_preprocessing", "//tensorflow/python/keras/utils:data_utils", ], ) @@ -65,6 +67,20 @@ tf_py_test( ], ) +tf_py_test( + name = "image_pipeline_test", + size = "small", + srcs = ["image_pipeline_test.py"], + python_version = "PY3", + deps = [ + ":image", + "//tensorflow/python:client_testlib", + "//tensorflow/python/compat:v2_compat", + "//tensorflow/python/keras", + "//third_party/py/numpy", + ], +) + tf_py_test( name = "sequence_test", size = "small", diff --git a/tensorflow/python/keras/preprocessing/image.py b/tensorflow/python/keras/preprocessing/image.py index c208db9ea9e..d44fd421929 100644 --- a/tensorflow/python/keras/preprocessing/image.py +++ b/tensorflow/python/keras/preprocessing/image.py @@ -50,22 +50,24 @@ def array_to_img(x, data_format=None, scale=True, dtype=None): Usage: - >>> img = np.random.random(size=(100, 100, 3)) - >>> try: - ... from PIL import Image - ... pil_img = tf.keras.preprocessing.image.array_to_img(img) - ... except ImportError: - ... pass + ```python + from PIL import Image + img = np.random.random(size=(100, 100, 3)) + pil_img = tf.keras.preprocessing.image.array_to_img(img) + ``` + Arguments: x: Input Numpy array. data_format: Image data format, can be either "channels_first" or - "channels_last". Defaults to `None`, which gets data format from Keras - backend. + "channels_last". Defaults to `None`, in which case the global setting + `tf.keras.backend.image_data_format()` is used (unless you changed it, + it defaults to "channels_last"). scale: Whether to rescale image values to be within `[0, 255]`. Defaults to `True`. - dtype: Dtype to use. Default to `None`, which gets float type from Keras - backend. + dtype: Dtype to use. Default to `None`, in which case the global setting + `tf.keras.backend.floatx()` is used (unless you changed it, it defaults + to "float32") Returns: A PIL Image instance. @@ -89,11 +91,25 @@ def array_to_img(x, data_format=None, scale=True, dtype=None): def img_to_array(img, data_format=None, dtype=None): """Converts a PIL Image instance to a Numpy array. + Usage: + + ```python + from PIL import Image + img_data = np.random.random(size=(100, 100, 3)) + img = tf.keras.preprocessing.image.array_to_img(img_data) + array = tf.keras.preprocessing.image.img_to_array(img) + ``` + + Arguments: - img: PIL Image instance. - data_format: Image data format, - either "channels_first" or "channels_last". - dtype: Dtype to use for the returned array. + img: Input PIL Image instance. + data_format: Image data format, can be either "channels_first" or + "channels_last". Defaults to `None`, in which case the global setting + `tf.keras.backend.image_data_format()` is used (unless you changed it, + it defaults to "channels_last"). + dtype: Dtype to use. 
Default to `None`, in which case the global setting + `tf.keras.backend.floatx()` is used (unless you changed it, it defaults + to "float32") Returns: A 3D Numpy array. diff --git a/tensorflow/python/keras/preprocessing/image_pipeline.py b/tensorflow/python/keras/preprocessing/image_pipeline.py new file mode 100644 index 00000000000..9cd8ed9d122 --- /dev/null +++ b/tensorflow/python/keras/preprocessing/image_pipeline.py @@ -0,0 +1,329 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Keras image dataset loading utilities.""" +# pylint: disable=g-classes-have-attributes +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import multiprocessing +import os + +import numpy as np + +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.keras.layers.preprocessing import image_preprocessing +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import image_ops +from tensorflow.python.ops import io_ops +from tensorflow.python.ops import math_ops + + +WHITELIST_FORMATS = ('.bmp', '.gif', '.jpeg', '.jpg', '.png') + + +def dataset_from_directory(directory, + labels='inferred', + label_mode='int', + class_names=None, + color_mode='rgb', + batch_size=32, + image_size=(256, 256), + shuffle=True, + seed=None, + follow_links=False, + validation_split=None, + subset=None, + interpolation='bilinear'): + """Generates a Dataset from image files in a directory. + + If your directory structure is: + + ``` + main_directory/ + ...class_a/ + ......a_image_1.jpg + ......a_image_2.jpg + ...class_b/ + ......b_image_1.jpg + ......b_image_2.jpg + ``` + + Then calling `from_directory(main_directory, labels='inferred')` + will return a Dataset that yields batches of images from + the subdirectories `class_a` and `class_b`, together with labels + 0 and 1 (0 corresponding to class_a and 1 corresponding to class_b). + + Supported image formats: jpeg, png, bmp, gif. + Animated gifs are truncated to the first frame. + + Arguments: + directory: Directory where the data is located. + If `labels` is "inferred", it should contain + subdirectories, each containing images for a class. + Otherwise, the directory structure is ignored. + labels: Either "inferred" + (labels are generated from the directory structure), + or a list/tuple of integer labels of the same size as the number of + image files found in the directory. Labels should be sorted according + to the alphanumeric order of the image file paths + (obtained via `os.walk(directory)` in Python). + label_mode: + - 'int': means that the labels are encoded as integers + (e.g. for `sparse_categorical_crossentropy` loss). + - 'categorical' means that the labels are + encoded as a categorical vector + (e.g. for `categorical_crossentropy` loss). 
+        - 'binary' means that the labels (there can be only 2)
+            are encoded as `float32` scalars with values 0 or 1
+            (e.g. for `binary_crossentropy`).
+        - None (no labels).
+    class_names: Only valid if "labels" is "inferred". This is the explicit
+        list of class names (must match names of subdirectories). Used
+        to control the order of the classes
+        (otherwise alphanumerical order is used).
+    color_mode: One of "grayscale", "rgb", "rgba". Default: "rgb".
+        Whether the images will be converted to
+        have 1, 3, or 4 channels.
+    batch_size: Size of the batches of data. Default: 32.
+    image_size: Size to resize images to after they are read from disk.
+        Defaults to `(256, 256)`.
+        Since the pipeline processes batches of images that must all have
+        the same size, this must be provided.
+    shuffle: Whether to shuffle the data. Default: True.
+        If set to False, sorts the data in alphanumeric order.
+    seed: Optional random seed for shuffling and transformations.
+    follow_links: Whether to visit subdirectories pointed to by symlinks.
+        Defaults to False.
+    validation_split: Optional float between 0 and 1,
+        fraction of data to reserve for validation.
+    subset: One of "training" or "validation".
+        Only used if `validation_split` is set.
+    interpolation: String, the interpolation method used when resizing images.
+        Defaults to `bilinear`. Supports `bilinear`, `nearest`, `bicubic`,
+        `area`, `lanczos3`, `lanczos5`, `gaussian`, `mitchellcubic`.
+
+  Returns:
+    A `tf.data.Dataset` object.
+      - If `label_mode` is None, it yields `float32` tensors of shape
+        `(batch_size, image_size[0], image_size[1], num_channels)`,
+        encoding images (see below for rules regarding `num_channels`).
+      - Otherwise, it yields a tuple `(images, labels)`, where `images`
+        has shape `(batch_size, image_size[0], image_size[1], num_channels)`,
+        and `labels` follows the format described below.
+
+  Rules regarding labels format:
+    - if `label_mode` is `int`, the labels are an `int32` tensor of shape
+      `(batch_size,)`.
+    - if `label_mode` is `binary`, the labels are a `float32` tensor of
+      1s and 0s of shape `(batch_size, 1)`.
+    - if `label_mode` is `categorical`, the labels are a `float32` tensor
+      of shape `(batch_size, num_classes)`, representing a one-hot
+      encoding of the class index.
+
+  Rules regarding number of channels in the yielded images:
+    - if `color_mode` is `grayscale`,
+      there's 1 channel in the image tensors.
+    - if `color_mode` is `rgb`,
+      there are 3 channels in the image tensors.
+    - if `color_mode` is `rgba`,
+      there are 4 channels in the image tensors.
+  """
+  if labels != 'inferred':
+    if not isinstance(labels, (list, tuple)):
+      raise ValueError(
+          '`labels` argument should be a list/tuple of integer labels, of '
+          'the same size as the number of image files in the target '
+          'directory. If you wish to infer the labels from the subdirectory '
+          'names in the target directory, pass `labels="inferred"`. '
+          'If you wish to get a dataset that only contains images '
+          '(no labels), pass `labels=None`.')
+    if class_names:
+      raise ValueError('You can only pass `class_names` if the labels are '
+                       'inferred from the subdirectory names in the target '
+                       'directory (`labels="inferred"`).')
+  if label_mode not in {'int', 'categorical', 'binary', None}:
+    raise ValueError(
+        '`label_mode` argument must be one of "int", "categorical", "binary", '
+        'or None. Received: %s' % (label_mode,))
+  if color_mode == 'rgb':
+    num_channels = 3
+  elif color_mode == 'rgba':
+    num_channels = 4
+  elif color_mode == 'grayscale':
+    num_channels = 1
+  else:
+    raise ValueError(
+        '`color_mode` must be one of {"rgb", "rgba", "grayscale"}. '
+        'Received: %s' % (color_mode,))
+  interpolation = image_preprocessing.get_interpolation(interpolation)
+
+  inferred_class_names = []
+  for subdir in sorted(os.listdir(directory)):
+    if os.path.isdir(os.path.join(directory, subdir)):
+      inferred_class_names.append(subdir)
+  if not class_names:
+    class_names = inferred_class_names
+  else:
+    if set(class_names) != set(inferred_class_names):
+      raise ValueError(
+          'The `class_names` passed did not match the '
+          'names of the subdirectories of the target directory. '
+          'Expected: %s, but received: %s' %
+          (inferred_class_names, class_names))
+  class_indices = dict(zip(class_names, range(len(class_names))))
+
+  if label_mode == 'binary' and len(class_names) != 2:
+    raise ValueError(
+        'When passing `label_mode="binary"`, there must be exactly 2 classes. '
+        'Found the following classes: %s' % (class_names,))
+
+  # Build an index of the images
+  # in the different class subfolders.
+  pool = multiprocessing.pool.ThreadPool()
+  results = []
+  filenames = []
+  for dirpath in (os.path.join(directory, subdir) for subdir in class_names):
+    results.append(
+        pool.apply_async(list_labeled_images_in_directory,
+                         (dirpath, class_indices, follow_links)))
+  labels_list = []
+  for res in results:
+    partial_labels, partial_filenames = res.get()
+    labels_list.append(partial_labels)
+    filenames += partial_filenames
+  if labels != 'inferred':
+    if len(labels) != len(filenames):
+      raise ValueError('Expected the lengths of `labels` to match the number '
+                       'of images in the target directory. len(labels) is %s '
+                       'while we found %s images in %s.' % (
+                           len(labels), len(filenames), directory))
+  else:
+    i = 0
+    labels = np.zeros((len(filenames),), dtype='int32')
+    for partial_labels in labels_list:
+      labels[i:i + len(partial_labels)] = partial_labels
+      i += len(partial_labels)
+
+  print('Found %d images belonging to %d classes.'
% + (len(filenames), len(class_names))) + pool.close() + pool.join() + image_paths = [os.path.join(directory, fname) for fname in filenames] + + if shuffle: + # Shuffle globally to erase macro-structure + # (the dataset will be further shuffled within a local buffer + # at each iteration) + if seed is None: + seed = np.random.randint(1e6) + rng = np.random.RandomState(seed) + rng.shuffle(image_paths) + rng = np.random.RandomState(seed) + rng.shuffle(labels) + + if validation_split: + if not 0 < validation_split < 1: + raise ValueError( + '`validation_split` must be between 0 and 1, received: %s' % + (validation_split,)) + num_val_samples = int(validation_split * len(image_paths)) + if subset == 'training': + image_paths = image_paths[:-num_val_samples] + labels = labels[:-num_val_samples] + elif subset == 'validation': + image_paths = image_paths[-num_val_samples:] + labels = labels[-num_val_samples:] + else: + raise ValueError('`subset` must be either "training" ' + 'or "validation", received: %s' % (subset,)) + dataset = paths_and_labels_to_dataset( + image_paths=image_paths, + image_size=image_size, + num_channels=num_channels, + labels=labels, + label_mode=label_mode, + num_classes=len(class_names), + interpolation=interpolation) + if shuffle: + # Shuffle locally at each iteration + dataset = dataset.shuffle(buffer_size=batch_size * 8, seed=seed) + dataset = dataset.batch(batch_size) + return dataset + + +def paths_and_labels_to_dataset(image_paths, + image_size, + num_channels, + labels, + label_mode, + num_classes, + interpolation): + """Constructs a dataset of images and labels.""" + # TODO(fchollet): consider making num_parallel_calls settable + path_ds = dataset_ops.Dataset.from_tensor_slices(image_paths) + img_ds = path_ds.map( + lambda x: path_to_image(x, image_size, num_channels, interpolation)) + if label_mode: + label_ds = dataset_ops.Dataset.from_tensor_slices(labels) + if label_mode == 'binary': + label_ds = label_ds.map( + lambda x: array_ops.expand_dims(math_ops.cast(x, 'float32'), axis=-1)) + elif label_mode == 'categorical': + label_ds = label_ds.map(lambda x: array_ops.one_hot(x, num_classes)) + img_ds = dataset_ops.Dataset.zip((img_ds, label_ds)) + return img_ds + + +def iter_valid_files(directory, follow_links): + walk = os.walk(directory, followlinks=follow_links) + for root, _, files in sorted(walk, key=lambda x: x[0]): + for fname in sorted(files): + if fname.lower().endswith(WHITELIST_FORMATS): + yield root, fname + + +def list_labeled_images_in_directory(directory, class_indices, follow_links): + """Recursively walks directory and list image paths and their class index. + + Arguments: + directory: string, target directory. + class_indices: dict mapping class names to their index. + follow_links: boolean, whether to recursively follow subdirectories + (if False, we only list top-level images in `directory`). + + Returns: + tuple `(labels, filenames)`. `labels` is a list of integer + labels and `filenames` is a list of relative image paths corresponding + to these labels. 
+ """ + dirname = os.path.basename(directory) + valid_files = iter_valid_files(directory, follow_links) + labels = [] + filenames = [] + for root, fname in valid_files: + labels.append(class_indices[dirname]) + absolute_path = os.path.join(root, fname) + relative_path = os.path.join( + dirname, os.path.relpath(absolute_path, directory)) + filenames.append(relative_path) + return labels, filenames + + +def path_to_image(path, image_size, num_channels, interpolation): + img = io_ops.read_file(path) + img = image_ops.decode_image( + img, channels=num_channels, expand_animations=False) + return image_ops.resize_images_v2(img, image_size, method=interpolation) diff --git a/tensorflow/python/keras/preprocessing/image_pipeline_test.py b/tensorflow/python/keras/preprocessing/image_pipeline_test.py new file mode 100644 index 00000000000..cae0780d90b --- /dev/null +++ b/tensorflow/python/keras/preprocessing/image_pipeline_test.py @@ -0,0 +1,286 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for image_pipeline.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import shutil + +import numpy as np + +from tensorflow.python.compat import v2_compat +from tensorflow.python.keras import keras_parameterized +from tensorflow.python.keras.preprocessing import image as image_preproc +from tensorflow.python.keras.preprocessing import image_pipeline +from tensorflow.python.platform import test + +try: + import PIL # pylint:disable=g-import-not-at-top +except ImportError: + PIL = None + + +class DatasetFromDirectoryTest(keras_parameterized.TestCase): + + def _get_images(self, count=16, color_mode='rgb'): + width = height = 24 + imgs = [] + for _ in range(count): + if color_mode == 'grayscale': + img = np.random.randint(0, 256, size=(height, width, 1)) + elif color_mode == 'rgba': + img = np.random.randint(0, 256, size=(height, width, 4)) + else: + img = np.random.randint(0, 256, size=(height, width, 3)) + img = image_preproc.array_to_img(img) + imgs.append(img) + return imgs + + def _prepare_directory(self, + num_classes=2, + grayscale=False, + nested_dirs=False, + color_mode='rgb', + count=16): + # Get a unique temp directory + temp_dir = os.path.join(self.get_temp_dir(), str(np.random.randint(1e6))) + os.mkdir(temp_dir) + self.addCleanup(shutil.rmtree, temp_dir) + + # Generate paths to class subdirectories + paths = [] + for class_index in range(num_classes): + class_directory = 'class_%s' % (class_index,) + if nested_dirs: + class_paths = [ + class_directory, os.path.join(class_directory, 'subfolder_1'), + os.path.join(class_directory, 'subfolder_2'), os.path.join( + class_directory, 'subfolder_1', 'sub-subfolder') + ] + else: + class_paths = [class_directory] + for path in class_paths: + os.mkdir(os.path.join(temp_dir, path)) + paths += class_paths + + # Save images to the 
paths, spreading them across the class folders.
+    i = 0
+    for img in self._get_images(color_mode=color_mode, count=count):
+      path = paths[i % len(paths)]
+      if color_mode == 'rgb':
+        ext = 'jpg'
+      else:
+        ext = 'png'
+      filename = os.path.join(path, 'image_%s.%s' % (i, ext))
+      img.save(os.path.join(temp_dir, filename))
+      i += 1
+    return temp_dir
+
+  def test_dataset_from_directory_binary(self):
+    if PIL is None:
+      return  # Skip test if PIL is not available.
+
+    directory = self._prepare_directory(num_classes=2)
+    dataset = image_pipeline.dataset_from_directory(
+        directory, batch_size=8, image_size=(18, 18), label_mode='int')
+    batch = next(iter(dataset))
+    self.assertLen(batch, 2)
+    self.assertEqual(batch[0].shape, (8, 18, 18, 3))
+    self.assertEqual(batch[0].dtype.name, 'float32')
+    self.assertEqual(batch[1].shape, (8,))
+    self.assertEqual(batch[1].dtype.name, 'int32')
+
+    dataset = image_pipeline.dataset_from_directory(
+        directory, batch_size=8, image_size=(18, 18), label_mode='binary')
+    batch = next(iter(dataset))
+    self.assertLen(batch, 2)
+    self.assertEqual(batch[0].shape, (8, 18, 18, 3))
+    self.assertEqual(batch[0].dtype.name, 'float32')
+    self.assertEqual(batch[1].shape, (8, 1))
+    self.assertEqual(batch[1].dtype.name, 'float32')
+
+    dataset = image_pipeline.dataset_from_directory(
+        directory, batch_size=8, image_size=(18, 18), label_mode='categorical')
+    batch = next(iter(dataset))
+    self.assertLen(batch, 2)
+    self.assertEqual(batch[0].shape, (8, 18, 18, 3))
+    self.assertEqual(batch[0].dtype.name, 'float32')
+    self.assertEqual(batch[1].shape, (8, 2))
+    self.assertEqual(batch[1].dtype.name, 'float32')
+
+  def test_sample_count(self):
+    if PIL is None:
+      return  # Skip test if PIL is not available.
+
+    directory = self._prepare_directory(num_classes=4, count=15)
+    dataset = image_pipeline.dataset_from_directory(
+        directory, batch_size=8, image_size=(18, 18), label_mode=None)
+    sample_count = 0
+    for batch in dataset:
+      sample_count += batch.shape[0]
+    self.assertEqual(sample_count, 15)
+
+  def test_dataset_from_directory_multiclass(self):
+    if PIL is None:
+      return  # Skip test if PIL is not available.
+
+    directory = self._prepare_directory(num_classes=4, count=15)
+
+    dataset = image_pipeline.dataset_from_directory(
+        directory, batch_size=8, image_size=(18, 18), label_mode=None)
+    batch = next(iter(dataset))
+    self.assertEqual(batch.shape, (8, 18, 18, 3))
+
+    dataset = image_pipeline.dataset_from_directory(
+        directory, batch_size=8, image_size=(18, 18), label_mode=None)
+    sample_count = 0
+    iterator = iter(dataset)
+    for batch in dataset:
+      sample_count += next(iterator).shape[0]
+    self.assertEqual(sample_count, 15)
+
+    dataset = image_pipeline.dataset_from_directory(
+        directory, batch_size=8, image_size=(18, 18), label_mode='int')
+    batch = next(iter(dataset))
+    self.assertLen(batch, 2)
+    self.assertEqual(batch[0].shape, (8, 18, 18, 3))
+    self.assertEqual(batch[0].dtype.name, 'float32')
+    self.assertEqual(batch[1].shape, (8,))
+    self.assertEqual(batch[1].dtype.name, 'int32')
+
+    dataset = image_pipeline.dataset_from_directory(
+        directory, batch_size=8, image_size=(18, 18), label_mode='categorical')
+    batch = next(iter(dataset))
+    self.assertLen(batch, 2)
+    self.assertEqual(batch[0].shape, (8, 18, 18, 3))
+    self.assertEqual(batch[0].dtype.name, 'float32')
+    self.assertEqual(batch[1].shape, (8, 4))
+    self.assertEqual(batch[1].dtype.name, 'float32')
+
+  def test_dataset_from_directory_color_modes(self):
+    if PIL is None:
+      return  # Skip test if PIL is not available.
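For orientation, the call pattern these tests exercise is roughly the following (a sketch: `main_directory` stands in for the example layout from the docstring, and the internal module path is used since this diff adds no public export):

```python
from tensorflow.python.keras.preprocessing import image_pipeline

ds = image_pipeline.dataset_from_directory(
    'main_directory',            # contains class_a/ and class_b/ subfolders
    labels='inferred',
    label_mode='categorical',
    image_size=(256, 256),
    batch_size=32)
for images, labels in ds.take(1):
  print(images.shape, labels.shape)  # (32, 256, 256, 3) (32, num_classes)
```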
+ + directory = self._prepare_directory(num_classes=4, color_mode='rgba') + dataset = image_pipeline.dataset_from_directory( + directory, batch_size=8, image_size=(18, 18), color_mode='rgba') + batch = next(iter(dataset)) + self.assertLen(batch, 2) + self.assertEqual(batch[0].shape, (8, 18, 18, 4)) + self.assertEqual(batch[0].dtype.name, 'float32') + + directory = self._prepare_directory(num_classes=4, color_mode='grayscale') + dataset = image_pipeline.dataset_from_directory( + directory, batch_size=8, image_size=(18, 18), color_mode='grayscale') + batch = next(iter(dataset)) + self.assertLen(batch, 2) + self.assertEqual(batch[0].shape, (8, 18, 18, 1)) + self.assertEqual(batch[0].dtype.name, 'float32') + + def test_dataset_from_directory_validation_split(self): + if PIL is None: + return # Skip test if PIL is not available. + + directory = self._prepare_directory(num_classes=2, count=10) + dataset = image_pipeline.dataset_from_directory( + directory, batch_size=10, image_size=(18, 18), + validation_split=0.2, subset='training') + batch = next(iter(dataset)) + self.assertLen(batch, 2) + self.assertEqual(batch[0].shape, (8, 18, 18, 3)) + dataset = image_pipeline.dataset_from_directory( + directory, batch_size=10, image_size=(18, 18), + validation_split=0.2, subset='validation') + batch = next(iter(dataset)) + self.assertLen(batch, 2) + self.assertEqual(batch[0].shape, (2, 18, 18, 3)) + + def test_dataset_from_directory_manual_labels(self): + if PIL is None: + return # Skip test if PIL is not available. + + directory = self._prepare_directory(num_classes=2, count=2) + dataset = image_pipeline.dataset_from_directory( + directory, batch_size=8, image_size=(18, 18), + labels=[0, 1], shuffle=False) + batch = next(iter(dataset)) + self.assertLen(batch, 2) + self.assertAllClose(batch[1], [0, 1]) + + def test_dataset_from_directory_follow_links(self): + if PIL is None: + return # Skip test if PIL is not available. + + directory = self._prepare_directory(num_classes=2, count=25, + nested_dirs=True) + dataset = image_pipeline.dataset_from_directory( + directory, batch_size=8, image_size=(18, 18), label_mode=None, + follow_links=True) + sample_count = 0 + for batch in dataset: + sample_count += batch.shape[0] + self.assertEqual(sample_count, 25) + + def test_dataset_from_directory_errors(self): + if PIL is None: + return # Skip test if PIL is not available. 
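One subtlety worth illustrating before the error cases: because the split is taken from a single seeded global shuffle, the training and validation calls must share the same `seed` and `validation_split` to partition the files consistently. A sketch, under the same assumptions as above:

```python
from tensorflow.python.keras.preprocessing import image_pipeline

train_ds = image_pipeline.dataset_from_directory(
    'main_directory', validation_split=0.2, subset='training',
    seed=1337, image_size=(256, 256), batch_size=32)
val_ds = image_pipeline.dataset_from_directory(
    'main_directory', validation_split=0.2, subset='validation',
    seed=1337, image_size=(256, 256), batch_size=32)
# The two datasets cover disjoint files: the last 20% of the shuffled list
# goes to 'validation', the rest to 'training'.
```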
+
+    directory = self._prepare_directory(num_classes=3, count=5)
+
+    with self.assertRaisesRegex(ValueError, '`labels` argument should be'):
+      _ = image_pipeline.dataset_from_directory(
+          directory, labels=None)
+
+    with self.assertRaisesRegex(ValueError, '`label_mode` argument must be'):
+      _ = image_pipeline.dataset_from_directory(directory, label_mode='other')
+
+    with self.assertRaisesRegex(ValueError, '`color_mode` must be one of'):
+      _ = image_pipeline.dataset_from_directory(directory, color_mode='other')
+
+    with self.assertRaisesRegex(
+        ValueError, 'only pass `class_names` if the labels are inferred'):
+      _ = image_pipeline.dataset_from_directory(
+          directory, labels=[0, 0, 1, 1, 1],
+          class_names=['class_0', 'class_1', 'class_2'])
+
+    with self.assertRaisesRegex(
+        ValueError,
+        'Expected the lengths of `labels` to match the number of images'):
+      _ = image_pipeline.dataset_from_directory(directory, labels=[0, 0, 1, 1])
+
+    with self.assertRaisesRegex(
+        ValueError, '`class_names` passed did not match'):
+      _ = image_pipeline.dataset_from_directory(
+          directory, class_names=['class_0', 'class_2'])
+
+    with self.assertRaisesRegex(ValueError, 'there must be exactly 2 classes'):
+      _ = image_pipeline.dataset_from_directory(directory, label_mode='binary')
+
+    with self.assertRaisesRegex(ValueError,
+                                '`validation_split` must be between 0 and 1'):
+      _ = image_pipeline.dataset_from_directory(directory, validation_split=2)
+
+    with self.assertRaisesRegex(ValueError,
+                                '`subset` must be either "training" or'):
+      _ = image_pipeline.dataset_from_directory(
+          directory, validation_split=0.2, subset='other')
+
+
+if __name__ == '__main__':
+  v2_compat.enable_v2_behavior()
+  test.main()
diff --git a/tensorflow/python/keras/saving/BUILD b/tensorflow/python/keras/saving/BUILD
index 3a4bca18e40..7ab6639d118 100644
--- a/tensorflow/python/keras/saving/BUILD
+++ b/tensorflow/python/keras/saving/BUILD
@@ -19,6 +19,7 @@ py_library(
         "save.py",
         "saved_model/base_serialization.py",
         "saved_model/constants.py",
+        "saved_model/json_utils.py",
         "saved_model/layer_serialization.py",
         "saved_model/load.py",
         "saved_model/model_serialization.py",
@@ -164,6 +165,7 @@ tf_py_test(
     size = "medium",
     srcs = ["saved_model/revive_test.py"],
     python_version = "PY3",
+    shard_count = 4,
    deps = [
        "//tensorflow/python:client_testlib",
        "//tensorflow/python/keras",
@@ -171,3 +173,15 @@
        "@absl_py//absl/testing:parameterized",
    ],
 )
+
+tf_py_test(
+    name = "json_utils_test",
+    size = "small",
+    srcs = ["saved_model/json_utils_test.py"],
+    python_version = "PY3",
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
diff --git a/tensorflow/python/keras/saving/save_test.py b/tensorflow/python/keras/saving/save_test.py
index 602c3cdd359..965a1b88cc7 100644
--- a/tensorflow/python/keras/saving/save_test.py
+++ b/tensorflow/python/keras/saving/save_test.py
@@ -213,6 +213,39 @@ class TestSaveModel(test.TestCase):
                      rnn_layers[1].kernel.name)
     self.assertIn('rnn_cell1', rnn_layers[1].kernel.name)

+  @test_util.run_in_graph_and_eager_modes
+  def test_saving_optimizer_weights(self):
+
+    class MyModel(keras.Model):
+
+      def __init__(self):
+        super(MyModel, self).__init__()
+        self.layer = keras.layers.Dense(1)
+
+      def call(self, x):
+        return self.layer(x)
+
+    path = os.path.join(self.get_temp_dir(), 'weights_path')
+    x, y = np.ones((10, 10)), np.ones((10, 1))
+
+    model = MyModel()
+    model.compile('rmsprop', loss='bce')
+    model.train_on_batch(x, y)
+
model.reset_metrics() + model.save_weights(path, save_format='tf') + + batch_loss = model.train_on_batch(x, y) + + new_model = MyModel() + new_model.compile('rmsprop', loss='bce') + new_model.train_on_batch(x, y) + new_model.reset_metrics() + + new_model.load_weights(path) + new_batch_loss = new_model.train_on_batch(x, y) + + self.assertAllClose(batch_loss, new_batch_loss) + if __name__ == '__main__': test.main() diff --git a/tensorflow/python/keras/saving/saved_model/base_serialization.py b/tensorflow/python/keras/saving/saved_model/base_serialization.py index 565601a5242..0065e6d786e 100644 --- a/tensorflow/python/keras/saving/saved_model/base_serialization.py +++ b/tensorflow/python/keras/saving/saved_model/base_serialization.py @@ -19,12 +19,10 @@ from __future__ import division from __future__ import print_function import abc -import json - import six +from tensorflow.python.keras.saving.saved_model import json_utils from tensorflow.python.training.tracking import tracking -from tensorflow.python.util import serialization @six.add_metaclass(abc.ABCMeta) @@ -53,9 +51,7 @@ class SavedModelSaver(object): """ # TODO(kathywu): check that serialized JSON can be loaded (e.g., if an # object is in the python property) - return json.dumps( - self.python_properties, - default=serialization.get_json_type) + return json_utils.Encoder().encode(self.python_properties) def list_extra_dependencies_for_serialization(self, serialization_cache): """Lists extra dependencies to serialize to SavedModel. diff --git a/tensorflow/python/keras/saving/saved_model/json_utils.py b/tensorflow/python/keras/saving/saved_model/json_utils.py new file mode 100644 index 00000000000..0ac86d4e692 --- /dev/null +++ b/tensorflow/python/keras/saving/saved_model/json_utils.py @@ -0,0 +1,69 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Utils for creating and loading the Layer metadata for SavedModel. + +These are required to retain the original format of the build input shape, since +layers and models may have different build behaviors depending on if the shape +is a list, tuple, or TensorShape. For example, Network.build() will create +separate inputs if the given input_shape is a list, and will create a single +input if the given shape is a tuple. 
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import json + +from tensorflow.python.framework import tensor_shape +from tensorflow.python.util import serialization + + +class Encoder(json.JSONEncoder): + """JSON encoder and decoder that handles TensorShapes and tuples.""" + + def default(self, obj): + if isinstance(obj, tensor_shape.TensorShape): + items = obj.as_list() if obj.rank is not None else None + return {'class_name': 'TensorShape', 'items': items} + return serialization.get_json_type(obj) + + def encode(self, obj): + return super(Encoder, self).encode(_encode_tuple(obj)) + + +def _encode_tuple(x): + if isinstance(x, tuple): + return {'class_name': '__tuple__', + 'items': tuple(_encode_tuple(i) for i in x)} + elif isinstance(x, list): + return [_encode_tuple(i) for i in x] + elif isinstance(x, dict): + return {key: _encode_tuple(value) for key, value in x.items()} + else: + return x + + +def decode(json_string): + return json.loads(json_string, object_hook=_decode_helper) + + +def _decode_helper(obj): + if isinstance(obj, dict) and 'class_name' in obj: + if obj['class_name'] == 'TensorShape': + return tensor_shape.TensorShape(obj['items']) + elif obj['class_name'] == '__tuple__': + return tuple(_decode_helper(i) for i in obj['items']) + return obj diff --git a/tensorflow/python/keras/saving/saved_model/json_utils_test.py b/tensorflow/python/keras/saving/saved_model/json_utils_test.py new file mode 100644 index 00000000000..f940279404f --- /dev/null +++ b/tensorflow/python/keras/saving/saved_model/json_utils_test.py @@ -0,0 +1,55 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +# pylint: disable=protected-access +"""Tests the JSON encoder and decoder.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import tensor_shape +from tensorflow.python.keras.saving.saved_model import json_utils +from tensorflow.python.platform import test + + +class JsonUtilsTest(test.TestCase): + + def test_encode_decode_tensor_shape(self): + metadata = { + 'key1': tensor_shape.TensorShape(None), + 'key2': [tensor_shape.TensorShape([None]), + tensor_shape.TensorShape([3, None, 5])]} + string = json_utils.Encoder().encode(metadata) + loaded = json_utils.decode(string) + + self.assertEqual(set(loaded.keys()), {'key1', 'key2'}) + self.assertAllEqual(loaded['key1'].rank, None) + self.assertAllEqual(loaded['key2'][0].as_list(), [None]) + self.assertAllEqual(loaded['key2'][1].as_list(), [3, None, 5]) + + def test_encode_decode_tuple(self): + metadata = { + 'key1': (3, 5), + 'key2': [(1, (3, 4)), (1,)]} + string = json_utils.Encoder().encode(metadata) + loaded = json_utils.decode(string) + + self.assertEqual(set(loaded.keys()), {'key1', 'key2'}) + self.assertAllEqual(loaded['key1'], (3, 5)) + self.assertAllEqual(loaded['key2'], [(1, (3, 4)), (1,)]) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/python/keras/saving/saved_model/layer_serialization.py b/tensorflow/python/keras/saving/saved_model/layer_serialization.py index ab1edaab585..6dffcc65c7e 100644 --- a/tensorflow/python/keras/saving/saved_model/layer_serialization.py +++ b/tensorflow/python/keras/saving/saved_model/layer_serialization.py @@ -68,6 +68,8 @@ class LayerSavedModelSaver(base_serialization.SavedModelSaver): hasattr(self.obj.activity_regularizer, 'get_config')): metadata['activity_regularizer'] = generic_utils.serialize_keras_object( self.obj.activity_regularizer) + if self.obj._build_input_shape is not None: # pylint: disable=protected-access + metadata['build_input_shape'] = self.obj._build_input_shape # pylint: disable=protected-access return metadata def objects_to_serialize(self, serialization_cache): diff --git a/tensorflow/python/keras/saving/saved_model/load.py b/tensorflow/python/keras/saving/saved_model/load.py index d53530ec1d7..af511e6586a 100644 --- a/tensorflow/python/keras/saving/saved_model/load.py +++ b/tensorflow/python/keras/saving/saved_model/load.py @@ -17,7 +17,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import json import re from tensorflow.python.eager import context @@ -29,6 +28,7 @@ from tensorflow.python.keras import regularizers from tensorflow.python.keras.engine import input_spec from tensorflow.python.keras.saving import saving_utils from tensorflow.python.keras.saving.saved_model import constants +from tensorflow.python.keras.saving.saved_model import json_utils from tensorflow.python.keras.saving.saved_model import utils from tensorflow.python.keras.saving.saved_model.serialized_attributes import CommonEndpoints from tensorflow.python.keras.utils import generic_utils @@ -40,6 +40,7 @@ from tensorflow.python.training.tracking import base as trackable from tensorflow.python.training.tracking.tracking import delete_tracking from tensorflow.python.util import compat from tensorflow.python.util import nest +from tensorflow.python.util import object_identity from tensorflow.python.util.lazy_loader import LazyLoader # To avoid 
circular dependencies between keras/engine and keras/saving, @@ -164,6 +165,8 @@ class KerasObjectLoader(tf_load.Loader): # records all nodes that were generated directly/indirectly from the config, # so that they do not get recreated multiple times. self._nodes_recreated_from_config = {} + self._all_nodes_recreated_from_config = ( + object_identity.ObjectIdentityWeakSet()) # Store all node ids that have already been traversed when tracking nodes # that were recreated from the config. self._traversed_nodes_from_config = [] @@ -227,30 +230,36 @@ class KerasObjectLoader(tf_load.Loader): return self._traversed_nodes_from_config.append(node_id) obj._maybe_initialize_trackable() + for reference in proto.children: obj_child = obj._lookup_dependency(reference.local_name) - setter = setattr + child_id = reference.node_id + child_proto = self._proto.nodes[child_id] + if not isinstance(obj_child, trackable.Trackable): continue - if obj_child._object_identifier in revived_types.registered_identifiers(): - setter = lambda *unused: None + if (child_proto.user_object.identifier in + revived_types.registered_identifiers()): + setter = revived_types.get_setter(child_proto.user_object) elif obj_child._object_identifier in KERAS_OBJECT_IDENTIFIERS: - metadata = self._proto.nodes[reference.node_id].user_object.metadata setter = _revive_setter - _add_serialized_attributes(obj_child, json.loads(metadata)) + else: + setter = setattr # pylint: enable=protected-access - if (reference.node_id in self._nodes_recreated_from_config and - self._nodes_recreated_from_config[reference.node_id][0] is not - obj_child): + + if (child_id in self._nodes_recreated_from_config and + self._nodes_recreated_from_config[child_id][0] is not obj_child): # This means that the same trackable object is referenced by two # different objects that were recreated from the config. logging.warn('Looks like there is an object (perhaps variable or layer)' ' that is shared between different layers/models. This ' - 'may cause issues when training the model. Object: {}' - .format(obj_child)) - self._nodes_recreated_from_config[reference.node_id] = obj_child, setter + 'may cause issues when restoring the variable values.' + 'Object: {}'.format(obj_child)) + self._nodes_recreated_from_config[child_id] = ( + obj_child, self._config_node_setter(setter)) + self._all_nodes_recreated_from_config.add(obj_child) self._add_children_recreated_from_config( - obj_child, self._proto.nodes[reference.node_id], reference.node_id) + obj_child, child_proto, child_id) def _load_layers(self): layers = {} @@ -262,19 +271,35 @@ class KerasObjectLoader(tf_load.Loader): def _load_layer(self, proto, node_id): """Load a single layer from a SavedUserObject proto.""" + metadata = json_utils.decode(proto.metadata) + + # If node was already created + if node_id in self._nodes_recreated_from_config: + node, setter = self._nodes_recreated_from_config[node_id] + + self._try_build_layer(node, node_id, metadata.get('build_input_shape')) + + # Revive setter requires the object to have a `_serialized_attributes` + # property. Add it here. + _maybe_add_serialized_attributes(node, metadata) + + config = metadata.get('config') + if _is_graph_network(node) and generic_utils.validate_config(config): + self.model_layer_dependencies[node_id] = ( + node, self._get_child_layer_node_ids(node_id, node.name)) + return node, setter + # Detect whether this object can be revived from the config. If not, then # revive from the SavedModel instead. 
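A note on the new `object_identity` import above: `ObjectIdentityWeakSet` tracks the recreated nodes by object identity rather than `__eq__`, and holds them weakly so the loader does not extend their lifetime. A minimal illustration (the `Node` class is a stand-in):

```python
from tensorflow.python.util import object_identity

class Node(object):
  def __eq__(self, other):  # value equality would wrongly deduplicate nodes
    return True

tracked = object_identity.ObjectIdentityWeakSet()
a, b = Node(), Node()
tracked.add(a)
print(a in tracked, b in tracked)  # True False -- identity, not equality
```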
- metadata = json.loads(proto.metadata) obj, setter = self._revive_from_config(metadata, node_id) if obj is None: obj, setter = revive_custom_object(proto.identifier, metadata) - if setter == _revive_setter: - # Add an attribute that stores the extra functions/objects saved in the - # SavedModel. Most of these functions/objects are ignored, but some are - # used later in the loading process (e.g. the list of regularization - # losses, or the training config of compiled models). - _add_serialized_attributes(obj, metadata) + # Add an attribute that stores the extra functions/objects saved in the + # SavedModel. Most of these functions/objects are ignored, but some are + # used later in the loading process (e.g. the list of regularization + # losses, or the training config of compiled models). + _maybe_add_serialized_attributes(obj, metadata) return obj, setter def _revive_from_config(self, metadata, node_id): @@ -284,8 +309,9 @@ class KerasObjectLoader(tf_load.Loader): if obj is None: return None, None - setter = _revive_setter + setter = self._config_node_setter(_revive_setter) self._nodes_recreated_from_config[node_id] = obj, setter + self._all_nodes_recreated_from_config.add(obj) self._add_children_recreated_from_config( obj, self._proto.nodes[node_id], node_id) return obj, setter @@ -300,9 +326,8 @@ class KerasObjectLoader(tf_load.Loader): model_is_functional_or_sequential = ( metadata.get('is_graph_network', False) or metadata['class_name'] == 'Sequential') - if (config is None or - generic_utils.LAYER_UNDEFINED_CONFIG_KEY in config or - not model_is_functional_or_sequential): + if not (generic_utils.validate_config(config) and + model_is_functional_or_sequential): return None # Revive as custom model. # Revive functional and sequential models as blank model objects for now ( @@ -329,7 +354,7 @@ class KerasObjectLoader(tf_load.Loader): # found. class_name = metadata.get('class_name') config = metadata.get('config') - if config is None or generic_utils.LAYER_UNDEFINED_CONFIG_KEY in config: + if not generic_utils.validate_config(config): return None try: @@ -348,16 +373,31 @@ class KerasObjectLoader(tf_load.Loader): obj._set_dtype_policy(metadata['dtype']) # pylint: enable=protected-access - input_shape = None - if not isinstance(obj, input_layer.InputLayer): - input_shape = self._infer_inputs(node_id, convert_to_shapes=True) - if input_shape is None: - return None - obj.build(input_shape) - obj.built = True + build_input_shape = metadata.get('build_input_shape') + built = self._try_build_layer(obj, node_id, build_input_shape) + + if not built: + # If the layer cannot be built, revive a custom layer instead. 
+ return None return obj + def _try_build_layer(self, obj, node_id, build_input_shape): + """Attempts to build the layer.""" + if obj.built or hasattr(obj.build, '_is_default'): + obj.built = True + return True + + if build_input_shape is None: + build_input_shape = self._infer_inputs(node_id, convert_to_shapes=True) + + if build_input_shape is not None: + obj.build(build_input_shape) + base_layer.Layer.build(obj, build_input_shape) + return True + + return False + def _load_edges(self): """Add edges for all nodes that are not waiting on initialization.""" for node_id, proto in enumerate(self._proto.nodes): @@ -432,8 +472,8 @@ class KerasObjectLoader(tf_load.Loader): .format(uninitialized_model_names)) def _reconstruct_model(self, model_id, model, layers): - config = ( - json.loads(self._proto.nodes[model_id].user_object.metadata)['config']) + config = json_utils.decode( + self._proto.nodes[model_id].user_object.metadata)['config'] if isinstance(model, models_lib.Sequential): if not isinstance(layers[0], input_layer.InputLayer): if 'batch_input_shape' in config['layers'][0]['config']: @@ -502,6 +542,14 @@ class KerasObjectLoader(tf_load.Loader): else: return inputs + def _config_node_setter(self, setter): + """Creates edges for nodes that are recreated from config.""" + def setattr_wrapper(obj, name, value): + # Avoid overwriting attributes of objects recreated from the config. + if obj._lookup_dependency(name) is None: # pylint: disable=protected-access + setter(obj, name, value) + return setattr_wrapper + def _finalize_saved_model_layers(layers): """Runs the final steps of loading Keras Layers from SavedModel.""" @@ -626,8 +674,9 @@ class RevivedLayer(object): with trackable.no_automatic_dependency_tracking_scope(revived_obj): # pylint:disable=protected-access revived_obj._expects_training_arg = metadata['expects_training_arg'] - if metadata.get('config') is not None: - revived_obj._config = metadata['config'] + config = metadata.get('config') + if generic_utils.validate_config(config): + revived_obj._config = config if metadata.get('input_spec') is not None: revived_obj.input_spec = recursively_deserialize_keras_object( metadata['input_spec'], @@ -747,8 +796,9 @@ class RevivedNetwork(RevivedLayer): with trackable.no_automatic_dependency_tracking_scope(revived_obj): # pylint:disable=protected-access revived_obj._expects_training_arg = metadata['expects_training_arg'] - if metadata.get('config') is not None: - revived_obj._config = metadata['config'] + config = metadata.get('config') + if generic_utils.validate_config(config): + revived_obj._config = config if metadata.get('activity_regularizer') is not None: revived_obj.activity_regularizer = regularizers.deserialize( @@ -769,12 +819,13 @@ def _set_network_attributes_from_metadata(revived_obj): # pylint:enable=protected-access -def _add_serialized_attributes(layer, metadata): +def _maybe_add_serialized_attributes(layer, metadata): # Store attributes revived from SerializedAttributes in an un-tracked # dictionary. The attributes are the ones listed in CommonEndpoints or # "keras_api" for keras-specific attributes.
- with trackable.no_automatic_dependency_tracking_scope(layer): - layer._serialized_attributes = {'metadata': metadata} # pylint: disable=protected-access + if not hasattr(layer, '_serialized_attributes'): + with trackable.no_automatic_dependency_tracking_scope(layer): + layer._serialized_attributes = {'metadata': metadata} # pylint: disable=protected-access def _get_keras_attr(layer): diff --git a/tensorflow/python/keras/saving/saved_model/revive_test.py b/tensorflow/python/keras/saving/saved_model/revive_test.py index 3e267340caa..ca3ecfc5a77 100644 --- a/tensorflow/python/keras/saving/saved_model/revive_test.py +++ b/tensorflow/python/keras/saving/saved_model/revive_test.py @@ -50,15 +50,18 @@ class SubclassedModelNoConfig(keras.Model): self.a = a self.b = b self.shared = CustomLayerNoConfig(a, b) - self.all_layers = [ + self.all_layers = [] + + def build(self, input_shape): + self.all_layers.extend([ self.shared, - CustomLayerWithConfig(a + 1, b + 2), - CustomLayerNoConfig(a + 3, b + 4), + CustomLayerWithConfig(self.a + 1, self.b + 2), + CustomLayerNoConfig(self.a + 3, self.b + 4), keras.Sequential([ # TODO(b/145029112): Bug with losses when there are shared layers. # self.shared, <-- Enable when bug is fixed. - CustomLayerNoConfig(a + 5, b + 6) - ])] + CustomLayerNoConfig(self.a + 5, self.b + 6)])]) + super(SubclassedModelNoConfig, self).build(input_shape) def call(self, inputs): x = inputs diff --git a/tensorflow/python/keras/saving/saving_utils.py b/tensorflow/python/keras/saving/saving_utils.py index 9a82f69a2fd..6550e2341ce 100644 --- a/tensorflow/python/keras/saving/saving_utils.py +++ b/tensorflow/python/keras/saving/saving_utils.py @@ -18,6 +18,7 @@ from __future__ import division from __future__ import print_function import collections +import copy import os import six @@ -76,6 +77,7 @@ def model_input_signature(model, keep_original_batch_size=False): input_specs = model._get_save_spec(dynamic_batch=not keep_original_batch_size) # pylint: disable=protected-access if input_specs is None: return None + input_specs = _enforce_names_consistency(input_specs) # Return a list with a single element as the model's input signature. if isinstance(input_specs, collections.Sequence) and len(input_specs) == 1: # Note that the isinstance check filters out single-element dictionaries, @@ -279,3 +281,25 @@ def _deserialize_metric(metric_config): # case handling for these in compile, based on model output shape. return metric_config return metrics_module.deserialize(metric_config) + + +def _enforce_names_consistency(specs): + """Enforces that either all specs have names or none do.""" + + def _has_name(spec): + return hasattr(spec, 'name') and spec.name is not None + + def _clear_name(spec): + spec = copy.deepcopy(spec) + if hasattr(spec, 'name'): + spec._name = None # pylint:disable=protected-access + return spec + + flat_specs = nest.flatten(specs) + name_inconsistency = ( + any(_has_name(s) for s in flat_specs) and + not all(_has_name(s) for s in flat_specs)) + + if name_inconsistency: + specs = nest.map_structure(_clear_name, specs) + return specs diff --git a/tensorflow/python/keras/utils/generic_utils.py b/tensorflow/python/keras/utils/generic_utils.py index edbfed6d776..bbb6155e30e 100644 --- a/tensorflow/python/keras/utils/generic_utils.py +++ b/tensorflow/python/keras/utils/generic_utils.py @@ -44,7 +44,7 @@ _GLOBAL_CUSTOM_NAMES = {} _SKIP_FAILED_SERIALIZATION = False # If a layer does not have a defined config, then the returned config will be a # dictionary with the below key. 
-LAYER_UNDEFINED_CONFIG_KEY = 'layer was saved without config' +_LAYER_UNDEFINED_CONFIG_KEY = 'layer was saved without config' @@ -271,7 +271,7 @@ def serialize_keras_object(instance): except NotImplementedError as e: if _SKIP_FAILED_SERIALIZATION: return serialize_keras_class_and_config( - name, {LAYER_UNDEFINED_CONFIG_KEY: True}) + name, {_LAYER_UNDEFINED_CONFIG_KEY: True}) raise e serialization_config = {} for key, item in config.items(): @@ -756,12 +756,6 @@ def to_list(x): return [x] -def object_list_uid(object_list): - """Creates a single string from object ids.""" - object_list = nest.flatten(object_list) - return ', '.join(str(abs(id(x))) for x in object_list) - - def to_snake_case(name): intermediate = re.sub('(.)([A-Z][a-z0-9]+)', r'\1_\2', name) insecure = re.sub('([a-z])([A-Z])', r'\1_\2', intermediate).lower() @@ -796,3 +790,8 @@ def validate_kwargs(kwargs, for kwarg in kwargs: if kwarg not in allowed_kwargs: raise TypeError(error_message, kwarg) + + +def validate_config(config): + """Determines whether config appears to be a valid layer config.""" + return isinstance(config, dict) and _LAYER_UNDEFINED_CONFIG_KEY not in config diff --git a/tensorflow/python/keras/utils/losses_utils.py b/tensorflow/python/keras/utils/losses_utils.py index 5687d2a1c4b..e81058e3b70 100644 --- a/tensorflow/python/keras/utils/losses_utils.py +++ b/tensorflow/python/keras/utils/losses_utils.py @@ -119,3 +119,30 @@ def scale_loss_for_distribution(loss_value): if num_replicas > 1: loss_value *= (1. / num_replicas) return loss_value + + +def cast_losses_to_common_dtype(losses): + """Cast a list of losses to a common dtype. + + If any loss is floating-point, they will all be cast to the most precise + floating-point dtype. Otherwise the losses are not cast. Casting is also + skipped if any of the losses is complex. + + Args: + losses: A list of losses. + + Returns: + `losses`, but they have been cast to a common dtype. + """ + highest_float = None + for loss in losses: + if loss.dtype.is_floating: + if highest_float is None or loss.dtype.size > highest_float.size: + highest_float = loss.dtype + elif {loss.dtype, highest_float} == {'bfloat16', 'float16'}: + highest_float = 'float32' + if loss.dtype.is_complex: + return losses # If we find any complex losses, do not cast any losses + if highest_float: + losses = [math_ops.cast(loss, highest_float) for loss in losses] + return losses diff --git a/tensorflow/python/keras/utils/tf_utils.py b/tensorflow/python/keras/utils/tf_utils.py index 57b5c605db9..cf7240f7e3a 100644 --- a/tensorflow/python/keras/utils/tf_utils.py +++ b/tensorflow/python/keras/utils/tf_utils.py @@ -210,6 +210,10 @@ def convert_shapes(input_shape, to_tuples=True): Returns: Nested structure of shapes in desired format. + + Raises: + ValueError: when the input tensor shape can't be converted to tuples, e.g. + an unknown tensor shape.
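+
+  A minimal sketch of both outcomes (using this module's `convert_shapes` and
+  `tensor_shape`, with the default `to_tuples=True`):
+
+    convert_shapes(tensor_shape.TensorShape([3, 4]))  # returns (3, 4)
+    convert_shapes(tensor_shape.TensorShape(None))    # raises ValueError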
""" def _is_shape_component(value): diff --git a/tensorflow/python/kernel_tests/one_hot_op_test.py b/tensorflow/python/kernel_tests/one_hot_op_test.py index 7b5bd824e26..9a18c1c3bd9 100644 --- a/tensorflow/python/kernel_tests/one_hot_op_test.py +++ b/tensorflow/python/kernel_tests/one_hot_op_test.py @@ -33,16 +33,19 @@ class OneHotTest(test.TestCase): use_gpu=False, expected_err_re=None, raises=None, + dtype=None, **inputs): with self.cached_session(use_gpu=use_gpu): if raises is not None: with self.assertRaises(raises): - array_ops.one_hot(**inputs) + array_ops.one_hot(dtype=dtype, **inputs) else: - ans = array_ops.one_hot(**inputs) + ans = array_ops.one_hot(dtype=dtype, **inputs) if expected_err_re is None: tf_ans = self.evaluate(ans) self.assertAllEqual(tf_ans, truth) + if dtype: + self.assertEqual(tf_ans.dtype, dtype) self.assertEqual(tf_ans.shape, ans.get_shape()) else: with self.assertRaisesOpError(expected_err_re): @@ -91,13 +94,16 @@ class OneHotTest(test.TestCase): dtype=dtype) # axis == -1 - self._testBothOneHot(indices=indices, depth=depth, truth=truth) + self._testBothOneHot(indices=indices, depth=depth, dtype=dtype, truth=truth) # axis == 0 self._testBothOneHot( - indices=indices, depth=depth, axis=0, + indices=indices, depth=depth, axis=0, dtype=dtype, truth=truth.T) # Output is transpose version in this case + def testDefaultNoDtype(self): + self._testDefaultBasic(None) + def testFloatBasic(self): self._testBasic(np.float32) self._testDefaultBasic(np.float32) @@ -303,7 +309,6 @@ class OneHotTest(test.TestCase): depth=depth, on_value=on_value, off_value=off_value, - dtype=dtypes.string, truth=truth) on_value = constant_op.constant(b"1.0") @@ -313,7 +318,6 @@ class OneHotTest(test.TestCase): depth=depth, on_value=on_value, off_value=off_value, - dtype=dtypes.string, truth=truth) on_value = b"1.0" @@ -323,7 +327,6 @@ class OneHotTest(test.TestCase): depth=depth, on_value=on_value, off_value=off_value, - dtype=dtypes.string, truth=truth) def testIndicesTypes(self): @@ -400,8 +403,8 @@ class OneHotTest(test.TestCase): def testDtypeMismatchTypeError(self): indices = [0, 1, 2] depth = 3 - on_value = np.asarray(1.0, np.float32) - off_value = np.asarray(0.0, np.float32) + on_value = constant_op.constant(1.0, dtypes.float32) + off_value = constant_op.constant(0.0, dtypes.float32) dtype = np.int32 self._testBothOneHot( @@ -420,6 +423,37 @@ class OneHotTest(test.TestCase): truth=None, raises=TypeError) + def testConvertToTensorOfCorrectDtype(self): + indices = [0, 1, 2] + depth = 3 + dtype = np.float16 + truth = np.asarray([[1, 0, 0], + [0, 1, 0], + [0, 0, 1]]) + self._testBothOneHot( + truth=truth, + indices=indices, + depth=depth, + on_value=1.0, + off_value=constant_op.constant(0.0, dtype), + dtype=dtype) + + self._testBothOneHot( + truth=truth, + indices=indices, + depth=depth, + on_value=constant_op.constant(1.0, dtype), + off_value=0., + dtype=dtype) + + self._testBothOneHot( + truth=truth, + indices=indices, + depth=depth, + on_value=1.0, + off_value=0., + dtype=dtype) + def testOneHotUint8WithLargeArray(self): with self.cached_session(use_gpu=False) as sess: matrix = np.random.rand(256) * 10 diff --git a/tensorflow/python/kernel_tests/sparse_cross_op_test.py b/tensorflow/python/kernel_tests/sparse_cross_op_test.py index 566bbb56f00..5037f82af72 100644 --- a/tensorflow/python/kernel_tests/sparse_cross_op_test.py +++ b/tensorflow/python/kernel_tests/sparse_cross_op_test.py @@ -23,8 +23,10 @@ import numpy from tensorflow.python.client import session from tensorflow.python.framework 
import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import test_util +from tensorflow.python.ops import array_ops from tensorflow.python.ops import sparse_ops from tensorflow.python.platform import test @@ -410,6 +412,52 @@ class SparseCrossOpTest(test.TestCase): constant_op.constant(values, value_type, [len(indices)]), constant_op.constant(shape, dtypes.int64)) + def test_invalid_sparse_tensors(self): + # Test validation of invalid SparseTensors. The SparseTensor constructor + # prevents us from creating invalid SparseTensors (esp. in eager mode), + # so we create valid SparseTensors and then modify them to be invalid. + + st1 = sparse_tensor.SparseTensor([[0, 0]], [0], [2, 2]) + st1._indices = array_ops.zeros([], dtypes.int64) + with self.assertRaisesRegexp((errors.InvalidArgumentError, ValueError), + 'Input indices should be a matrix'): + self.evaluate(sparse_ops.sparse_cross([st1])) + + st2 = sparse_tensor.SparseTensor([[0, 0]], [0], [2, 2]) + st2._values = array_ops.zeros([], dtypes.int64) + with self.assertRaisesRegexp((errors.InvalidArgumentError, ValueError), + 'Input values should be a vector'): + self.evaluate(sparse_ops.sparse_cross([st2])) + + st3 = sparse_tensor.SparseTensor([[0, 0]], [0], [2, 2]) + st3._dense_shape = array_ops.zeros([], dtypes.int64) + with self.assertRaisesRegexp((errors.InvalidArgumentError, ValueError), + 'Input shapes should be a vector'): + self.evaluate(sparse_ops.sparse_cross([st3])) + + def test_bad_tensor_shapes(self): + # All inputs must be 2D. + with self.assertRaisesRegexp((errors.InvalidArgumentError, ValueError), + 'Expected D2 of index to be 2'): + st = sparse_tensor.SparseTensor([[0]], [0], [10]) # 1D SparseTensor + self.evaluate(sparse_ops.sparse_cross([st])) + + with self.assertRaisesRegexp((errors.InvalidArgumentError, ValueError), + 'Dense inputs should be a matrix'): + dt = array_ops.zeros([0]) # 1D DenseTensor.
+ self.evaluate(sparse_ops.sparse_cross([dt])) + + def test_batch_size_mismatch(self): + st1 = sparse_tensor.SparseTensor([[0, 0]], [0], [10, 10]) # batch size 10 + st2 = sparse_tensor.SparseTensor([[0, 0]], [0], [7, 10]) # batch size 7 + dt = array_ops.zeros([5, 0]) # batch size 5 + with self.assertRaisesRegexp((errors.InvalidArgumentError, ValueError), + 'Expected batch size'): + self.evaluate(sparse_ops.sparse_cross([st1, dt])) + with self.assertRaisesRegexp((errors.InvalidArgumentError, ValueError), + 'Expected batch size'): + self.evaluate(sparse_ops.sparse_cross([st1, st2])) + if __name__ == '__main__': test.main() diff --git a/tensorflow/python/module/BUILD b/tensorflow/python/module/BUILD index 4585d39e592..fea6fe123ad 100644 --- a/tensorflow/python/module/BUILD +++ b/tensorflow/python/module/BUILD @@ -28,6 +28,7 @@ tf_py_test( "//tensorflow/python:client_testlib", "//tensorflow/python:variables", "//tensorflow/python/compat:v2_compat", + "//tensorflow/python/distribute:tpu_values", "@absl_py//absl/testing:parameterized", ], ) diff --git a/tensorflow/python/module/module_test.py b/tensorflow/python/module/module_test.py index 7fa4fc14d7f..b2fc4ff9645 100644 --- a/tensorflow/python/module/module_test.py +++ b/tensorflow/python/module/module_test.py @@ -26,6 +26,7 @@ from absl.testing import parameterized import six from tensorflow.python import tf2 +from tensorflow.python.distribute import tpu_values from tensorflow.python.distribute import values as distributed_values from tensorflow.python.eager import context from tensorflow.python.eager import def_function @@ -249,10 +250,8 @@ class VariableTrackingTest(test_util.TensorFlowTestCase): def test_supports_distributed_variables(self): mirrored = distributed_values.MirroredVariable( None, [variables.Variable(1.)], variables.VariableAggregation.SUM) - tpu = distributed_values.TPUMirroredVariable( - strategy=None, - values=[variables.Variable(42.)], - aggregation=None) + tpu = tpu_values.TPUMirroredVariable( + strategy=None, values=[variables.Variable(42.)], aggregation=None) aggregating = distributed_values.AggregatingVariable( strategy=None, v=variables.Variable(1.), aggregation=None) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 4b805a64d36..50afcfbc6e0 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -3949,11 +3949,13 @@ def one_hot(indices, on_exists = on_value is not None off_exists = off_value is not None - on_dtype = ( - ops.convert_to_tensor(on_value).dtype.base_dtype if on_exists else None) - off_dtype = ( - ops.convert_to_tensor(off_value).dtype.base_dtype - if off_exists else None) + if on_exists: + on_value = ops.convert_to_tensor(on_value, dtype_hint=dtype) + if off_exists: + off_value = ops.convert_to_tensor(off_value, dtype_hint=dtype) + + on_dtype = on_value.dtype.base_dtype if on_exists else None + off_dtype = off_value.dtype.base_dtype if off_exists else None if on_exists or off_exists: if dtype is not None: diff --git a/tensorflow/python/ops/linalg/linear_operator.py b/tensorflow/python/ops/linalg/linear_operator.py index 194889c1ad5..4a181d72f2a 100644 --- a/tensorflow/python/ops/linalg/linear_operator.py +++ b/tensorflow/python/ops/linalg/linear_operator.py @@ -751,14 +751,11 @@ class LinearOperator(module.Module): with self._name_scope(name): return self._log_abs_determinant() - def _solve(self, rhs, adjoint=False, adjoint_arg=False): - """Default implementation of _solve.""" - if self.is_square is False: + def 
_dense_solve(self, rhs, adjoint=False, adjoint_arg=False): + """Solve by conversion to a dense matrix.""" + if self.is_square is False: # pylint: disable=g-bool-id-comparison raise NotImplementedError( "Solve is not yet implemented for non-square operators.") - logging.warn( - "Using (possibly slow) default implementation of solve." - " Requires conversion to a dense matrix and O(N^3) operations.") rhs = linalg.adjoint(rhs) if adjoint_arg else rhs if self._can_use_cholesky(): return linalg_ops.cholesky_solve( @@ -766,6 +763,13 @@ class LinearOperator(module.Module): return linear_operator_util.matrix_solve_with_broadcast( self.to_dense(), rhs, adjoint=adjoint) + def _solve(self, rhs, adjoint=False, adjoint_arg=False): + """Default implementation of _solve.""" + logging.warn( + "Using (possibly slow) default implementation of solve." + " Requires conversion to a dense matrix and O(N^3) operations.") + return self._dense_solve(rhs, adjoint=adjoint, adjoint_arg=adjoint_arg) + def solve(self, rhs, adjoint=False, adjoint_arg=False, name="solve"): """Solve (exact or approx) `R` (batch) systems of equations: `A X = rhs`. diff --git a/tensorflow/python/ops/linalg/linear_operator_full_matrix.py b/tensorflow/python/ops/linalg/linear_operator_full_matrix.py index 8fe68919250..8d92d1accaa 100644 --- a/tensorflow/python/ops/linalg/linear_operator_full_matrix.py +++ b/tensorflow/python/ops/linalg/linear_operator_full_matrix.py @@ -183,5 +183,8 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator): return math_ops.matmul( self._matrix, x, adjoint_a=adjoint, adjoint_b=adjoint_arg) + def _solve(self, rhs, adjoint=False, adjoint_arg=False): + return self._dense_solve(rhs, adjoint=adjoint, adjoint_arg=adjoint_arg) + def _to_dense(self): return self._matrix diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 231bb5e4cc6..9395016bd20 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -4668,3 +4668,27 @@ def sobol_sample(dim, num_results, skip=0, dtype=dtypes.float32, name=None): """ with ops.name_scope(name, "sobol", [dim, num_results, skip]): return gen_math_ops.sobol_sample(dim, num_results, skip, dtype=dtype) + + +@tf_export("math.rsqrt", v1=["math.rsqrt", "rsqrt"]) +@deprecation.deprecated_endpoints("rsqrt") +@dispatch.add_dispatch_support +def rsqrt(x, name=None): + """Computes reciprocal of square root of x element-wise. + + For example: + + >>> x = tf.constant([2., 0., -2.]) + >>> tf.math.rsqrt(x) + <tf.Tensor: shape=(3,), dtype=float32, + numpy=array([0.707, inf, nan], dtype=float32)> + + Args: + x: A `tf.Tensor`. Must be one of the following types: `bfloat16`, `half`, + `float32`, `float64`. + name: A name for the operation (optional). + + Returns: + A `tf.Tensor`. Has the same type as `x`.
+ """ + return gen_math_ops.rsqrt(x, name) diff --git a/tensorflow/python/ops/parallel_for/BUILD b/tensorflow/python/ops/parallel_for/BUILD index 88ddf7a7ec8..6bc33e10a23 100644 --- a/tensorflow/python/ops/parallel_for/BUILD +++ b/tensorflow/python/ops/parallel_for/BUILD @@ -109,6 +109,8 @@ cuda_py_test( name = "control_flow_ops_test", srcs = ["control_flow_ops_test.py"], tags = ["no_rocm"], + # TODO(b/149957923): The test is flaky + xla_enable_strict_auto_jit = False, deps = [ ":control_flow_ops", ":test_util", diff --git a/tensorflow/python/ops/ragged/ragged_dispatch.py b/tensorflow/python/ops/ragged/ragged_dispatch.py index 5b0f19358fa..7a4bfef154f 100644 --- a/tensorflow/python/ops/ragged/ragged_dispatch.py +++ b/tensorflow/python/ops/ragged/ragged_dispatch.py @@ -539,8 +539,9 @@ def _ragged_op_signature(op, ragged_args): arg_names[pos] = '**' + arg_names[pos] + '**' # Add argument defaults. - for pos in range(-1, -len(argspec.defaults) - 1, -1): - arg_names[pos] += '=`{!r}`'.format(argspec.defaults[pos]) + if argspec.defaults is not None: + for pos in range(-1, -len(argspec.defaults) - 1, -1): + arg_names[pos] += '=`{!r}`'.format(argspec.defaults[pos]) # Add varargs and keyword args if argspec.varargs: diff --git a/tensorflow/python/ops/ragged/ragged_string_ops.py b/tensorflow/python/ops/ragged/ragged_string_ops.py index d5a508a08be..493e5b97cd6 100755 --- a/tensorflow/python/ops/ragged/ragged_string_ops.py +++ b/tensorflow/python/ops/ragged/ragged_string_ops.py @@ -457,8 +457,8 @@ def string_split_v2(input, sep=None, maxsplit=-1, name=None): # pylint: disable """Split elements of `input` based on `sep` into a `RaggedTensor`. Let N be the size of `input` (typically N will be the batch size). Split each - element of `input` based on `sep` and return a `SparseTensor` or - `RaggedTensor` containing the split tokens. Empty tokens are ignored. + element of `input` based on `sep` and return a `RaggedTensor` containing the + split tokens. Empty tokens are ignored. 
Example: diff --git a/tensorflow/python/platform/benchmark.py b/tensorflow/python/platform/benchmark.py index bee27d1ca9f..dcfa4d1ef1a 100644 --- a/tensorflow/python/platform/benchmark.py +++ b/tensorflow/python/platform/benchmark.py @@ -18,6 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import math import numbers import os import re @@ -379,6 +380,18 @@ class TensorFlowBenchmark(Benchmark): lm1 = l - 1 return (s[l//2] + s[lm1//2]) / 2.0 + def _mean_and_stdev(x): + # Returns the sample mean and sample standard deviation; -1 stands in + # for values that cannot be computed (empty or single-element input). + if not x: + return -1, -1 + l = len(x) + mean = sum(x) / l + if l == 1: + return mean, -1 + variance = sum([(e - mean) * (e - mean) for e in x]) / (l - 1) + return mean, math.sqrt(variance) + median_delta = _median(deltas) benchmark_values = { @@ -389,6 +400,10 @@ "throughput": mbs / median_delta } self.report_benchmark(**benchmark_values) + + mean_delta, stdev_delta = _mean_and_stdev(deltas) + unreported_extras["wall_time_mean"] = mean_delta + unreported_extras["wall_time_stdev"] = stdev_delta benchmark_values["extras"].update(unreported_extras) return benchmark_values diff --git a/tensorflow/python/saved_model/revived_types.py b/tensorflow/python/saved_model/revived_types.py index a802cdbe3ec..32d0b8ae53e 100644 --- a/tensorflow/python/saved_model/revived_types.py +++ b/tensorflow/python/saved_model/revived_types.py @@ -169,3 +169,13 @@ def deserialize(proto): def registered_identifiers(): return _REVIVED_TYPE_REGISTRY.keys() + + +def get_setter(proto): + _, type_registrations = _REVIVED_TYPE_REGISTRY.get( + proto.identifier, (None, None)) + if type_registrations is not None: + for type_registration in type_registrations: + if type_registration.should_load(proto): + return type_registration.setter + return None diff --git a/tensorflow/python/tfe_wrapper.cc b/tensorflow/python/tfe_wrapper.cc index 7be093a1340..48a5da2b0e2 100644 --- a/tensorflow/python/tfe_wrapper.cc +++ b/tensorflow/python/tfe_wrapper.cc @@ -473,13 +473,19 @@ PYBIND11_MODULE(_pywrap_tfe, m) { tensorflow::MaybeRaiseRegisteredFromTFStatus(status.get()); return output; }); - m.def("TFE_ContextClearRemoteExecutors", [](py::handle& ctx) { + m.def("TFE_ContextSyncExecutors", [](py::handle& ctx) { tensorflow::Safe_TF_StatusPtr status = tensorflow::make_safe(TF_NewStatus()); - TFE_ContextClearRemoteExecutors(tensorflow::InputTFE_Context(ctx), - status.get()); + TFE_ContextAsyncWait(tensorflow::InputTFE_Context(ctx), status.get()); tensorflow::MaybeRaiseRegisteredFromTFStatus(status.get()); }); + m.def("TFE_ContextClearExecutors", [](py::handle& ctx) { + tensorflow::Safe_TF_StatusPtr status = + tensorflow::make_safe(TF_NewStatus()); + TFE_ContextAsyncWait(tensorflow::InputTFE_Context(ctx), status.get()); + // NOTE: unlike TFE_ContextSyncExecutors, which raises potential errors, + // this deliberately ignores executor statuses during cleanup.
+ }); // TFE_Executor logic m.def( diff --git a/tensorflow/python/tpu/tpu_embedding.py b/tensorflow/python/tpu/tpu_embedding.py index a4ca0765f08..a073d49c666 100644 --- a/tensorflow/python/tpu/tpu_embedding.py +++ b/tensorflow/python/tpu/tpu_embedding.py @@ -1139,6 +1139,8 @@ class TPUEmbedding(object): 'table_ids': [], 'max_sequence_lengths': [], } + int_zeros = array_ops.zeros((0,), dtype=dtypes.int64) + float_zeros = array_ops.zeros((0,), dtype=dtypes.float32) for table_id, table in enumerate(self._table_to_features_dict): features = self._table_to_features_dict[table] for feature in features: @@ -1146,13 +1148,11 @@ class TPUEmbedding(object): kwargs['sample_indices'].append( enqueue_data.sample_indices - if enqueue_data.sample_indices is not None else array_ops.zeros( - (0,), dtype=dtypes.int64)) + if enqueue_data.sample_indices is not None else int_zeros) kwargs['aggregation_weights'].append( enqueue_data.aggregation_weights if - enqueue_data.aggregation_weights is not None else array_ops.zeros( - (0,), dtype=dtypes.float32)) + enqueue_data.aggregation_weights is not None else float_zeros) kwargs['embedding_indices'].append(enqueue_data.embedding_indices) diff --git a/tensorflow/python/util/deprecation_test.py b/tensorflow/python/util/deprecation_test.py index 035c416d793..a6ca3c6fda8 100644 --- a/tensorflow/python/util/deprecation_test.py +++ b/tensorflow/python/util/deprecation_test.py @@ -761,7 +761,7 @@ class DeprecatedArgValuesTest(test.TestCase): deprecation.deprecated_arg_values(date, None, deprecated=True) with self.assertRaisesRegexp(ValueError, "instructions"): deprecation.deprecated_arg_values(date, "", deprecated=True) - with self.assertRaisesRegexp(ValueError, "argument", deprecated=True): + with self.assertRaisesRegexp(ValueError, "argument"): deprecation.deprecated_arg_values(date, instructions) @test.mock.patch.object(logging, "warning", autospec=True) diff --git a/tensorflow/stream_executor/gpu/asm_compiler.cc b/tensorflow/stream_executor/gpu/asm_compiler.cc index 2d2ed0319e1..ba095b733ad 100644 --- a/tensorflow/stream_executor/gpu/asm_compiler.cc +++ b/tensorflow/stream_executor/gpu/asm_compiler.cc @@ -179,14 +179,14 @@ port::StatusOr> CompileGpuAsm(int device_ordinal, if (!env->LocalTempFilename(&ptx_path)) { return port::InternalError("couldn't get temp PTX file name"); } - auto ptx_cleaner = tensorflow::gtl::MakeCleanup([&ptx_path] { - TF_CHECK_OK(tensorflow::Env::Default()->DeleteFile(ptx_path)); - }); - TF_RETURN_IF_ERROR( tensorflow::WriteStringToFile(env, ptx_path, ptx_contents)); VLOG(2) << "ptx written to: " << ptx_path; + auto ptx_cleaner = tensorflow::gtl::MakeCleanup([&ptx_path] { + TF_CHECK_OK(tensorflow::Env::Default()->DeleteFile(ptx_path)); + }); + // Invoke ptxas and collect its output. 
string cubin_path; if (!env->LocalTempFilename(&cubin_path)) { diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-slurm-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-slurm-cluster-resolver.pbtxt index b487626520a..30cfac0830c 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-slurm-cluster-resolver.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-slurm-cluster-resolver.pbtxt @@ -9,7 +9,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'jobs\', \'port_base\', \'gpus_per_node\', \'gpus_per_task\', \'tasks_per_node\', \'auto_set_gpu\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'8888\', \'1\', \'1\', \'None\', \'True\', \'grpc\'], " + argspec: "args=[\'self\', \'jobs\', \'port_base\', \'gpus_per_node\', \'gpus_per_task\', \'tasks_per_node\', \'auto_set_gpu\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'8888\', \'None\', \'None\', \'None\', \'True\', \'grpc\'], " } member_method { name: "cluster_spec" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt index 440e6c8a5c4..a292da4a98f 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt @@ -295,7 +295,7 @@ tf_class { } member_method { name: "test_on_batch" - argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\', \'False\'], " } member_method { name: "to_json" @@ -307,7 +307,7 @@ tf_class { } member_method { name: "train_on_batch" - argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'False\'], " } member_method { name: "with_name_scope" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt index eee65bc6db4..6d4d44fabd1 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt @@ -312,7 +312,7 @@ tf_class { } member_method { name: "test_on_batch" - argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\', \'False\'], " } member_method { name: "to_json" @@ -324,7 +324,7 @@ tf_class { } member_method { name: "train_on_batch" - argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\', \'return_dict\'], varargs=None, keywords=None, 
defaults=[\'None\', \'None\', \'None\', \'True\', \'False\'], " } member_method { name: "with_name_scope" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-model.pbtxt index c64a1881f88..ff2e1748ecb 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-model.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-model.pbtxt @@ -296,7 +296,7 @@ tf_class { } member_method { name: "test_on_batch" - argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\', \'False\'], " } member_method { name: "to_json" @@ -308,7 +308,7 @@ tf_class { } member_method { name: "train_on_batch" - argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'False\'], " } member_method { name: "with_name_scope" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-wide-deep-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-wide-deep-model.pbtxt index 238701103f7..6ff66cc2430 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-wide-deep-model.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-wide-deep-model.pbtxt @@ -296,7 +296,7 @@ tf_class { } member_method { name: "test_on_batch" - argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\', \'False\'], " } member_method { name: "to_json" @@ -308,7 +308,7 @@ tf_class { } member_method { name: "train_on_batch" - argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'False\'], " } member_method { name: "with_name_scope" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.mixed_precision.experimental.-loss-scale-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.mixed_precision.experimental.-loss-scale-optimizer.pbtxt index c4049240358..de249627514 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.mixed_precision.experimental.-loss-scale-optimizer.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.mixed_precision.experimental.-loss-scale-optimizer.pbtxt @@ -38,7 +38,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'all_reduce_sum_gradients\'], varargs=None, 
keywords=None, defaults=[\'None\', \'True\'], " } member_method { name: "from_config" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt index 788efce0063..fe259aba368 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt @@ -295,7 +295,7 @@ tf_class { } member_method { name: "test_on_batch" - argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\', \'False\'], " } member_method { name: "to_json" @@ -307,7 +307,7 @@ tf_class { } member_method { name: "train_on_batch" - argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'False\'], " } member_method { name: "with_name_scope" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt index 6166b16f964..d6b323575f3 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt @@ -312,7 +312,7 @@ tf_class { } member_method { name: "test_on_batch" - argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\', \'False\'], " } member_method { name: "to_json" @@ -324,7 +324,7 @@ tf_class { } member_method { name: "train_on_batch" - argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'False\'], " } member_method { name: "with_name_scope" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-distributed-values.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-distributed-values.pbtxt new file mode 100644 index 00000000000..f1729ac0526 --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-distributed-values.pbtxt @@ -0,0 +1,9 @@ +path: "tensorflow.distribute.DistributedValues" +tf_class { + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'values\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-slurm-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-slurm-cluster-resolver.pbtxt index b487626520a..30cfac0830c 100644 --- 
a/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-slurm-cluster-resolver.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-slurm-cluster-resolver.pbtxt @@ -9,7 +9,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'jobs\', \'port_base\', \'gpus_per_node\', \'gpus_per_task\', \'tasks_per_node\', \'auto_set_gpu\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'8888\', \'1\', \'1\', \'None\', \'True\', \'grpc\'], " + argspec: "args=[\'self\', \'jobs\', \'port_base\', \'gpus_per_node\', \'gpus_per_task\', \'tasks_per_node\', \'auto_set_gpu\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'8888\', \'None\', \'None\', \'None\', \'True\', \'grpc\'], " } member_method { name: "cluster_spec" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt index e5e9424463a..3e226fd8e70 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt @@ -4,6 +4,10 @@ tf_module { name: "CrossDeviceOps" mtype: "" } + member { + name: "DistributedValues" + mtype: "" + } member { name: "HierarchicalCopyAllReduce" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.experimental.tensorrt.-conversion-params.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.experimental.tensorrt.-conversion-params.pbtxt index 6616ba97823..5eed1aa7d0a 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.experimental.tensorrt.-conversion-params.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.experimental.tensorrt.-conversion-params.pbtxt @@ -3,6 +3,10 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" + member { + name: "allow_build_at_runtime" + mtype: "" + } member { name: "is_dynamic_op" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt index 440e6c8a5c4..a292da4a98f 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt @@ -295,7 +295,7 @@ tf_class { } member_method { name: "test_on_batch" - argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\', \'False\'], " } member_method { name: "to_json" @@ -307,7 +307,7 @@ tf_class { } member_method { name: "train_on_batch" - argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'False\'], " } member_method { name: "with_name_scope" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt index eee65bc6db4..6d4d44fabd1 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt @@ -312,7 +312,7 @@ tf_class { } member_method { name: "test_on_batch" - argspec: 
"args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\', \'False\'], " } member_method { name: "to_json" @@ -324,7 +324,7 @@ tf_class { } member_method { name: "train_on_batch" - argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'False\'], " } member_method { name: "with_name_scope" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-model.pbtxt index c64a1881f88..ff2e1748ecb 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-model.pbtxt @@ -296,7 +296,7 @@ tf_class { } member_method { name: "test_on_batch" - argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\', \'False\'], " } member_method { name: "to_json" @@ -308,7 +308,7 @@ tf_class { } member_method { name: "train_on_batch" - argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'False\'], " } member_method { name: "with_name_scope" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-wide-deep-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-wide-deep-model.pbtxt index 238701103f7..6ff66cc2430 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-wide-deep-model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-wide-deep-model.pbtxt @@ -296,7 +296,7 @@ tf_class { } member_method { name: "test_on_batch" - argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\', \'False\'], " } member_method { name: "to_json" @@ -308,7 +308,7 @@ tf_class { } member_method { name: "train_on_batch" - argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'False\'], " } member_method { name: "with_name_scope" diff --git 
a/tensorflow/tools/api/golden/v2/tensorflow.keras.mixed_precision.experimental.-loss-scale-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.mixed_precision.experimental.-loss-scale-optimizer.pbtxt index c4049240358..de249627514 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.mixed_precision.experimental.-loss-scale-optimizer.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.mixed_precision.experimental.-loss-scale-optimizer.pbtxt @@ -38,7 +38,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'all_reduce_sum_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " } member_method { name: "from_config" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt index 788efce0063..fe259aba368 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt @@ -295,7 +295,7 @@ tf_class { } member_method { name: "test_on_batch" - argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\', \'False\'], " } member_method { name: "to_json" @@ -307,7 +307,7 @@ tf_class { } member_method { name: "train_on_batch" - argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'False\'], " } member_method { name: "with_name_scope" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt index 6166b16f964..d6b323575f3 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt @@ -312,7 +312,7 @@ tf_class { } member_method { name: "test_on_batch" - argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\', \'False\'], " } member_method { name: "to_json" @@ -324,7 +324,7 @@ tf_class { } member_method { name: "train_on_batch" - argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'False\'], " } member_method { name: "with_name_scope" diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu16.04-manylinux2010 
b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu16.04-manylinux2010 index 5f07f3adb70..e4aac64a864 100644 --- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu16.04-manylinux2010 +++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu16.04-manylinux2010 @@ -63,11 +63,23 @@ RUN apt-get update && apt-get install -y \ RUN /install/install_pip_packages.sh +# Install python 3.8. +RUN apt-get update && apt-get install -y python3.8 python3.8-dev python3.8-venv +RUN rm -rf /var/lib/apt/lists/* +# Have to download get-pip.py due to a pip circular issue +# https://stackoverflow.com/questions/58758447/how-to-fix-module-platform-has-no-attribute-linux-distribution-when-instal +RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py +RUN python3.8 get-pip.py +RUN python3.8 -m pip install --upgrade pip setuptools wheel + # TODO(klimek): Figure out a better way to get the right include paths # forwarded when we install new packages. RUN ln -s "/usr/include/x86_64-linux-gnu/python3.6m" "/dt7/usr/include/x86_64-linux-gnu/python3.6m" RUN ln -s "/usr/include/x86_64-linux-gnu/python3.6m" "/dt8/usr/include/x86_64-linux-gnu/python3.6m" +RUN ln -s "/usr/include/x86_64-linux-gnu/python3.8" "/dt7/usr/include/x86_64-linux-gnu/python3.8" +RUN ln -s "/usr/include/x86_64-linux-gnu/python3.8" "/dt8/usr/include/x86_64-linux-gnu/python3.8" + # Make apt work with python 3.6. RUN cp /usr/lib/python3/dist-packages/apt_pkg.cpython-35m-x86_64-linux-gnu.so \ /usr/lib/python3/dist-packages/apt_pkg.so diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010 b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010 index bf65772c33a..b529147e57e 100644 --- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010 +++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010 @@ -56,6 +56,7 @@ RUN /install/install_latest_clang.sh RUN /install/install_bazel.sh # Install python 3.6. +RUN apt-get install --reinstall python3-apt RUN yes "" | add-apt-repository ppa:deadsnakes/ppa RUN apt-get update && apt-get install -y \ python3.6 python3.6-dev python3-pip python3.6-venv && \ @@ -65,11 +66,24 @@ RUN apt-get update && apt-get install -y \ RUN /install/install_pip_packages.sh +# Install python 3.8. +RUN apt-get update && apt-get install -y python3.8 python3.8-dev python3.8-venv +RUN rm -rf /var/lib/apt/lists/* +# Have to download get-pip.py due to a pip circular issue +# https://stackoverflow.com/questions/58758447/how-to-fix-module-platform-has-no-attribute-linux-distribution-when-instal +RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py +RUN python3.8 get-pip.py +RUN python3.8 -m pip install --upgrade pip setuptools wheel + + # TODO(klimek): Figure out a better way to get the right include paths # forwarded when we install new packages. RUN ln -s "/usr/include/x86_64-linux-gnu/python3.6m" "/dt7/usr/include/x86_64-linux-gnu/python3.6m" RUN ln -s "/usr/include/x86_64-linux-gnu/python3.6m" "/dt8/usr/include/x86_64-linux-gnu/python3.6m" +RUN ln -s "/usr/include/x86_64-linux-gnu/python3.8" "/dt7/usr/include/x86_64-linux-gnu/python3.8" +RUN ln -s "/usr/include/x86_64-linux-gnu/python3.8" "/dt8/usr/include/x86_64-linux-gnu/python3.8" + # Make apt work with python 3.6. 
RUN cp /usr/lib/python3/dist-packages/apt_pkg.cpython-35m-x86_64-linux-gnu.so \ /usr/lib/python3/dist-packages/apt_pkg.so diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.ubuntu16.04-manylinux2010 b/tensorflow/tools/ci_build/Dockerfile.rbe.ubuntu16.04-manylinux2010 index de6a766a7c4..516129ccd43 100644 --- a/tensorflow/tools/ci_build/Dockerfile.rbe.ubuntu16.04-manylinux2010 +++ b/tensorflow/tools/ci_build/Dockerfile.rbe.ubuntu16.04-manylinux2010 @@ -64,6 +64,15 @@ RUN apt-get update && apt-get install -y \ RUN /install/install_pip_packages.sh +# Install python 3.8. +RUN apt-get update && apt-get install -y python3.8 python3.8-dev python3.8-venv +RUN rm -rf /var/lib/apt/lists/* +# Have to download get-pip.py due to a pip circular issue +# https://stackoverflow.com/questions/58758447/how-to-fix-module-platform-has-no-attribute-linux-distribution-when-instal +RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py +RUN python3.8 get-pip.py +RUN python3.8 -m pip install --upgrade pip setuptools wheel + # TODO(klimek): Figure out a better way to get the right include paths # forwarded when we install new packages. RUN ln -s "/usr/include/x86_64-linux-gnu/python2.7" "/dt7/usr/include/x86_64-linux-gnu/python2.7" @@ -71,3 +80,6 @@ RUN ln -s "/usr/include/x86_64-linux-gnu/python2.7" "/dt8/usr/include/x86_64-lin RUN ln -s "/usr/include/x86_64-linux-gnu/python3.6m" "/dt7/usr/include/x86_64-linux-gnu/python3.6m" RUN ln -s "/usr/include/x86_64-linux-gnu/python3.6m" "/dt8/usr/include/x86_64-linux-gnu/python3.6m" + +RUN ln -s "/usr/include/x86_64-linux-gnu/python3.8" "/dt7/usr/include/x86_64-linux-gnu/python3.8" +RUN ln -s "/usr/include/x86_64-linux-gnu/python3.8" "/dt8/usr/include/x86_64-linux-gnu/python3.8" diff --git a/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh b/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh index 30ea2846d08..925933d1f5e 100755 --- a/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh +++ b/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh @@ -15,16 +15,17 @@ # # ============================================================================== +# This script accepts only one parameter: either the word "eigen", or an +# integer value greater than 0 that is passed to the bazel test command +# via the OMP_NUM_THREADS action environment variable. If an integer is +# passed, the script assumes it is running in DNNL mode; the +# OMP_NUM_THREADS variable is irrelevant in eigen mode. + set -e set -x -MODE=${MODE:-"mkl"} -OMP_NUM_THREADS=${OMP_NUM_THREADS:-""} - -echo "" -echo "MODE:${MODE}" -echo "OMP_NUM_THREADS:${OMP_NUM_THREADS}" -echo "" +DEFAULT_OMP_NUM_THREADS="10" +DEFAULT_CONFIG="--config=mkl" N_JOBS=$(grep -c ^processor /proc/cpuinfo) @@ -36,28 +37,51 @@ echo "" export TF_NEED_CUDA=0 export PYTHON_BIN_PATH=`which python3` yes "" | $PYTHON_BIN_PATH configure.py -if [[ "$MODE" == "eigen" ]]; then + +# Get parameters from command-line rather than from env +# Setting OMP_THREADS for low performing benchmarks. +# Default value(=core count) degrades performance of some benchmark cases. +# Optimal thread count is case specific. +RE_DIGITS_ONLY="^[0-9]+$" +MIN_OMP_THREADS=1 +if [[ $# -ge 1 ]]; then + if [[ "$1" == "eigen" ]]; then CONFIG="" OMPTHREADS="" -else - CONFIG="--config=mkl" -# Setting OMP_THREADS for low performing benchmarks. -# Default value(=core count) degrades performance of some benchmark cases. -# Optimal thread count is case specific. -# An argument can be passed to script, the value of which is used if given. 
-# Otherwise OMP_NUM_THREADS is set to 10
-  if [[ -z $OMP_NUM_THREADS ]]; then
-    OMPTHREADS="--action_env=OMP_NUM_THREADS=10"
-  else
-    OMPTHREADS="--action_env=OMP_NUM_THREADS=$OMP_NUM_THREADS"
-  fi
+  elif [[ "$1" =~ ${RE_DIGITS_ONLY} && $1 -ge MIN_OMP_THREADS ]]; then
+    CONFIG="${DEFAULT_CONFIG}"
+    OMPTHREADS="--action_env=OMP_NUM_THREADS=${1}"
+  else
+    echo "${1} is not a valid configuration or"
+    echo "OMP_NUM_THREADS value. Exiting..."
+    exit 1
+  fi
+else # No parameters were passed in so set default values
+  CONFIG="${DEFAULT_CONFIG}"
+  OMPTHREADS="--action_env=OMP_NUM_THREADS=${DEFAULT_OMP_NUM_THREADS}"
fi
+echo ""
+echo "Bazel will test with CONFIG=${CONFIG} and OMPTHREADS=${OMPTHREADS}"
+echo ""
+
# Run bazel test command. Double test timeouts to avoid flakes.
# Setting KMP_BLOCKTIME to 0 lets OpenMP threads sleep right after parallel execution
# in an MKL primitive. This reduces the effects of an oversubscription of OpenMP threads
# caused by executing multiple tests concurrently.
-bazel test --test_tag_filters=-no_oss,-no_oss_py2,-oss_serial,-gpu,-tpu,-benchmark-test --test_lang_filters=cc,py -k \
-  --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 --build_tests_only \
-  ${CONFIG} --test_env=KMP_BLOCKTIME=0 ${OMPTHREADS} --config=opt --test_output=errors -- \
-  //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/... -//tensorflow/lite/...
+bazel test \
+  --test_tag_filters=-no_oss,-no_oss_py2,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only \
+  --test_lang_filters=cc,py \
+  -k \
+  --jobs=${N_JOBS} \
+  --test_timeout 300,450,1200,3600 \
+  --build_tests_only \
+  ${CONFIG} \
+  --test_env=KMP_BLOCKTIME=0 \
+  ${OMPTHREADS} \
+  --config=opt \
+  --test_output=errors \
+  -- \
+  //tensorflow/... \
+  -//tensorflow/compiler/... \
+  -//tensorflow/lite/...
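The rewritten run_mkl.sh argument handling above reduces to a three-way branch: the literal word `eigen`, a positive integer that becomes OMP_NUM_THREADS, or a rejected argument. A condensed sketch of that logic; the function name and messages are illustrative, not taken from the script:

```bash
# Same validation pattern as run_mkl.sh: "eigen", a positive integer, or reject.
RE_DIGITS_ONLY="^[0-9]+$"
classify_mode() {
  if [[ "$1" == "eigen" ]]; then
    echo "eigen mode (no --config=mkl, OMP_NUM_THREADS unused)"
  elif [[ "$1" =~ ${RE_DIGITS_ONLY} && "$1" -ge 1 ]]; then
    echo "DNNL mode with --action_env=OMP_NUM_THREADS=$1"
  else
    echo "invalid argument: $1" >&2
    return 1
  fi
}
classify_mode eigen                         # eigen mode
classify_mode 16                            # DNNL mode, 16 OpenMP threads
classify_mode banana || echo "rejected"     # fails the regex, nonzero status
```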
diff --git a/tensorflow/tools/ci_build/presubmit/macos/py2_cc/build.sh b/tensorflow/tools/ci_build/presubmit/macos/py2_cc/build.sh index 92acb7ab7fe..0885d208f1a 100644 --- a/tensorflow/tools/ci_build/presubmit/macos/py2_cc/build.sh +++ b/tensorflow/tools/ci_build/presubmit/macos/py2_cc/build.sh @@ -62,9 +62,7 @@ function run_build () { } source tensorflow/tools/ci_build/release/common.sh -update_bazel_macos -which bazel -set_bazel_outdir +install_bazelisk setup_pip run_build diff --git a/tensorflow/tools/ci_build/presubmit/macos/py37_cc/build.sh b/tensorflow/tools/ci_build/presubmit/macos/py37_cc/build.sh index ffc823a6e2e..658432af36d 100644 --- a/tensorflow/tools/ci_build/presubmit/macos/py37_cc/build.sh +++ b/tensorflow/tools/ci_build/presubmit/macos/py37_cc/build.sh @@ -60,9 +60,7 @@ function run_build () { } source tensorflow/tools/ci_build/release/common.sh -update_bazel_macos -which bazel -set_bazel_outdir +install_bazelisk setup_pip run_build diff --git a/tensorflow/tools/ci_build/presubmit/ubuntu_16/cpu_py36_full/build.sh b/tensorflow/tools/ci_build/presubmit/ubuntu_16/cpu_py36_full/build.sh index 48c7341c2fc..7a1fdfdb069 100644 --- a/tensorflow/tools/ci_build/presubmit/ubuntu_16/cpu_py36_full/build.sh +++ b/tensorflow/tools/ci_build/presubmit/ubuntu_16/cpu_py36_full/build.sh @@ -55,9 +55,8 @@ function run_build () { --config=rbe \ --python_path="${PYTHON_BIN_PATH}" \ --action_env=PATH="${ACTION_PATH}" \ - --action_env=PYTHON_BIN_PATH="${PYTHON_BIN_PATH}" \ --action_env=TF2_BEHAVIOR="${TF2_BEHAVIOR}" \ - --action_env=TF_PYTHON_CONFIG_REPO=@org_tensorflow//third_party/toolchains/preconfig/ubuntu16.04/py3 \ + --action_env=TF_PYTHON_CONFIG_REPO="@ubuntu16.04-manylinux2010-py3_config_python" \ --action_env=TF_ENABLE_XLA=1 \ --test_tag_filters="${tag_filters}" \ --build_tag_filters="${tag_filters}" \ diff --git a/tensorflow/tools/ci_build/release/common.sh b/tensorflow/tools/ci_build/release/common.sh index 2f111694dd2..9e4a59ccab3 100644 --- a/tensorflow/tools/ci_build/release/common.sh +++ b/tensorflow/tools/ci_build/release/common.sh @@ -67,7 +67,7 @@ function install_bazelisk { esac mkdir -p "$HOME/bin" wget --no-verbose -O "$HOME/bin/bazel" \ - "https://github.com/bazelbuild/bazelisk/releases/download/v1.2.1/$name" + "https://github.com/bazelbuild/bazelisk/releases/download/v1.3.0/$name" chmod u+x "$HOME/bin/bazel" if [[ ! ":$PATH:" =~ :"$HOME"/bin/?: ]]; then PATH="$HOME/bin:$PATH" @@ -103,25 +103,6 @@ function update_bazel_linux { # LINT.ThenChange( # //tensorflow_estimator/google/kokoro/common.sh) -# Install the given bazel version on macos -function update_bazel_macos { - if [[ -z "$1" ]]; then - BAZEL_VERSION=${LATEST_BAZEL_VERSION} - else - BAZEL_VERSION=$1 - fi - BAZEL_COMMAND="curl -L https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-darwin-x86_64.sh -O && \ - chmod +x bazel-*.sh && ./bazel-${BAZEL_VERSION}-installer-darwin-x86_64.sh --user && \ - rm -f bazel-${BAZEL_VERSION}-installer-darwin-x86_64.sh" - # If the bazel update fails retry again in 60 seconds. 
- run_with_retry "${BAZEL_COMMAND}" - # Add new bazel installation to path - PATH="/Users/kbuilder/bin:$PATH" - set_bazel_outdir - which bazel - bazel version -} - function install_pip2 { curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py sudo python2 get-pip.py @@ -153,6 +134,7 @@ function install_pip_deps { ${SUDO_CMD} ${PIP_CMD} install astunparse==1.6.3 ${SUDO_CMD} ${PIP_CMD} install keras_preprocessing==1.1.0 --no-deps "${PIP_CMD}" install numpy==1.16.0 --user + "${PIP_CMD}" install PyYAML==3.12 --user ${SUDO_CMD} ${PIP_CMD} install gast==0.3.3 ${SUDO_CMD} ${PIP_CMD} install h5py==2.10.0 ${SUDO_CMD} ${PIP_CMD} install six==1.12.0 @@ -193,6 +175,7 @@ function install_ubuntu_16_pip_deps { "${PIP_CMD}" install portpicker --user "${PIP_CMD}" install scipy --user "${PIP_CMD}" install scikit-learn --user + "${PIP_CMD}" install PyYAML==3.12 --user "${PIP_CMD}" install --user --upgrade tf-estimator-nightly "${PIP_CMD}" install --user --upgrade tb-nightly # LINT.ThenChange(:ubuntu_pip_installations) @@ -273,7 +256,7 @@ function copy_to_new_project_name { ORIGINAL_WHL_DIR_PREFIX="${ORIGINAL_PROJECT_NAME}-${VERSION}" NEW_WHL_DIR_PREFIX="${NEW_PROJECT_NAME}-${VERSION}" mv "${ORIGINAL_WHL_DIR_PREFIX}.dist-info" "${NEW_WHL_DIR_PREFIX}.dist-info" - mv "${ORIGINAL_WHL_DIR_PREFIX}.data" "${NEW_WHL_DIR_PREFIX}.data" + mv "${ORIGINAL_WHL_DIR_PREFIX}.data" "${NEW_WHL_DIR_PREFIX}.data" || echo sed -i.bak "s/${ORIGINAL_PROJECT_NAME}/${NEW_PROJECT_NAME}/g" "${NEW_WHL_DIR_PREFIX}.dist-info/RECORD" ORIGINAL_PROJECT_NAME_DASH="${ORIGINAL_PROJECT_NAME//_/-}" diff --git a/tensorflow/tools/ci_build/release/macos/cpu_libtensorflow/build.sh b/tensorflow/tools/ci_build/release/macos/cpu_libtensorflow/build.sh index 75a2f98d0fc..348778b5f15 100644 --- a/tensorflow/tools/ci_build/release/macos/cpu_libtensorflow/build.sh +++ b/tensorflow/tools/ci_build/release/macos/cpu_libtensorflow/build.sh @@ -18,7 +18,7 @@ echo "bazel clean --expunge" >> tensorflow/tools/ci_build/linux/libtensorflow.sh # Install latest bazel source tensorflow/tools/ci_build/release/common.sh -update_bazel_macos +install_bazelisk # Pick a version of xcode export DEVELOPER_DIR=/Applications/Xcode_10.3.app/Contents/Developer diff --git a/tensorflow/tools/ci_build/release/macos/cpu_libtensorflow/release.sh b/tensorflow/tools/ci_build/release/macos/cpu_libtensorflow/release.sh index ff17fd2ccaa..ccc80e1bafd 100644 --- a/tensorflow/tools/ci_build/release/macos/cpu_libtensorflow/release.sh +++ b/tensorflow/tools/ci_build/release/macos/cpu_libtensorflow/release.sh @@ -19,5 +19,5 @@ set -x # Install latest bazel source tensorflow/tools/ci_build/release/common.sh -update_bazel_macos +install_bazelisk tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh diff --git a/tensorflow/tools/ci_build/release/macos/cpu_py2_full/nightly_release.sh b/tensorflow/tools/ci_build/release/macos/cpu_py2_full/nightly_release.sh index ef951b73505..69c57179379 100644 --- a/tensorflow/tools/ci_build/release/macos/cpu_py2_full/nightly_release.sh +++ b/tensorflow/tools/ci_build/release/macos/cpu_py2_full/nightly_release.sh @@ -17,12 +17,7 @@ set -e set -x source tensorflow/tools/ci_build/release/common.sh - -# Install latest bazel -update_bazel_macos -which bazel - -set_bazel_outdir +install_bazelisk # Pick a version of xcode export DEVELOPER_DIR=/Applications/Xcode_10.3.app/Contents/Developer diff --git a/tensorflow/tools/ci_build/release/macos/cpu_py2_full/nonpip.sh b/tensorflow/tools/ci_build/release/macos/cpu_py2_full/nonpip.sh index e3b74060823..1a0cdd26d55 100644 --- 
a/tensorflow/tools/ci_build/release/macos/cpu_py2_full/nonpip.sh +++ b/tensorflow/tools/ci_build/release/macos/cpu_py2_full/nonpip.sh @@ -17,11 +17,7 @@ set -e set -x source tensorflow/tools/ci_build/release/common.sh -# Install latest bazel -update_bazel_macos -which bazel -bazel version -set_bazel_outdir +install_bazelisk # Pick a more recent version of xcode export DEVELOPER_DIR=/Applications/Xcode_10.3.app/Contents/Developer diff --git a/tensorflow/tools/ci_build/release/macos/cpu_py2_full/nonpip_v1.sh b/tensorflow/tools/ci_build/release/macos/cpu_py2_full/nonpip_v1.sh index 8f06f974619..7a4fb54e250 100644 --- a/tensorflow/tools/ci_build/release/macos/cpu_py2_full/nonpip_v1.sh +++ b/tensorflow/tools/ci_build/release/macos/cpu_py2_full/nonpip_v1.sh @@ -17,12 +17,7 @@ set -e set -x source tensorflow/tools/ci_build/release/common.sh - -# Install latest bazel -update_bazel_macos -which bazel -bazel version -set_bazel_outdir +install_bazelisk # Pick a more recent version of xcode sudo xcode-select --switch /Applications/Xcode_9.2.app/Contents/Developer diff --git a/tensorflow/tools/ci_build/release/macos/cpu_py2_full/pip.sh b/tensorflow/tools/ci_build/release/macos/cpu_py2_full/pip.sh index 3744559a988..f6de18d81ac 100644 --- a/tensorflow/tools/ci_build/release/macos/cpu_py2_full/pip.sh +++ b/tensorflow/tools/ci_build/release/macos/cpu_py2_full/pip.sh @@ -17,11 +17,7 @@ set -e set -x source tensorflow/tools/ci_build/release/common.sh -# Install latest bazel -update_bazel_macos -which bazel -bazel version -set_bazel_outdir +install_bazelisk # Pick a more recent version of xcode export DEVELOPER_DIR=/Applications/Xcode_10.3.app/Contents/Developer diff --git a/tensorflow/tools/ci_build/release/macos/cpu_py2_full/pip_v1.sh b/tensorflow/tools/ci_build/release/macos/cpu_py2_full/pip_v1.sh index 7309e3cb35c..c64d9c00787 100644 --- a/tensorflow/tools/ci_build/release/macos/cpu_py2_full/pip_v1.sh +++ b/tensorflow/tools/ci_build/release/macos/cpu_py2_full/pip_v1.sh @@ -17,12 +17,7 @@ set -e set -x source tensorflow/tools/ci_build/release/common.sh - -# Install latest bazel -update_bazel_macos -which bazel -bazel version -set_bazel_outdir +install_bazelisk # Install pip dependencies install_macos_pip_deps sudo diff --git a/tensorflow/tools/ci_build/release/macos/cpu_py2_xla/build.sh b/tensorflow/tools/ci_build/release/macos/cpu_py2_xla/build.sh index d98abde9447..367bc172ba0 100644 --- a/tensorflow/tools/ci_build/release/macos/cpu_py2_xla/build.sh +++ b/tensorflow/tools/ci_build/release/macos/cpu_py2_xla/build.sh @@ -17,10 +17,7 @@ set -e set -x source tensorflow/tools/ci_build/release/common.sh - -# Install latest bazel -update_bazel_macos -which bazel +install_bazelisk # Pick a more recent version of xcode sudo xcode-select --switch /Applications/Xcode_9.2.app/Contents/Developer diff --git a/tensorflow/tools/ci_build/release/macos/cpu_py35_full/nightly_release.sh b/tensorflow/tools/ci_build/release/macos/cpu_py35_full/nightly_release.sh index 8094e58aa09..1f018136ef9 100644 --- a/tensorflow/tools/ci_build/release/macos/cpu_py35_full/nightly_release.sh +++ b/tensorflow/tools/ci_build/release/macos/cpu_py35_full/nightly_release.sh @@ -17,12 +17,7 @@ set -e set -x source tensorflow/tools/ci_build/release/common.sh - -# Install latest bazel -update_bazel_macos -which bazel - -set_bazel_outdir +install_bazelisk # Pick a version of xcode export DEVELOPER_DIR=/Applications/Xcode_10.3.app/Contents/Developer diff --git a/tensorflow/tools/ci_build/release/macos/cpu_py35_full/nonpip.sh 
b/tensorflow/tools/ci_build/release/macos/cpu_py35_full/nonpip.sh index d821656ba12..63b614dd687 100644 --- a/tensorflow/tools/ci_build/release/macos/cpu_py35_full/nonpip.sh +++ b/tensorflow/tools/ci_build/release/macos/cpu_py35_full/nonpip.sh @@ -17,11 +17,7 @@ set -e set -x source tensorflow/tools/ci_build/release/common.sh -# Install latest bazel -update_bazel_macos -which bazel -bazel version -set_bazel_outdir +install_bazelisk # Pick a more recent version of xcode export DEVELOPER_DIR=/Applications/Xcode_10.3.app/Contents/Developer diff --git a/tensorflow/tools/ci_build/release/macos/cpu_py35_full/nonpip_v1.sh b/tensorflow/tools/ci_build/release/macos/cpu_py35_full/nonpip_v1.sh index 80768a15268..f045e7103e0 100644 --- a/tensorflow/tools/ci_build/release/macos/cpu_py35_full/nonpip_v1.sh +++ b/tensorflow/tools/ci_build/release/macos/cpu_py35_full/nonpip_v1.sh @@ -17,12 +17,7 @@ set -e set -x source tensorflow/tools/ci_build/release/common.sh - -# Install latest bazel -update_bazel_macos -which bazel -bazel version -set_bazel_outdir +install_bazelisk # Pick a more recent version of xcode sudo xcode-select --switch /Applications/Xcode_9.2.app/Contents/Developer diff --git a/tensorflow/tools/ci_build/release/macos/cpu_py35_full/pip.sh b/tensorflow/tools/ci_build/release/macos/cpu_py35_full/pip.sh index 4559c189616..8c9b91dd55e 100644 --- a/tensorflow/tools/ci_build/release/macos/cpu_py35_full/pip.sh +++ b/tensorflow/tools/ci_build/release/macos/cpu_py35_full/pip.sh @@ -17,11 +17,7 @@ set -e set -x source tensorflow/tools/ci_build/release/common.sh -# Install latest bazel -update_bazel_macos -which bazel -bazel version -set_bazel_outdir +install_bazelisk # Pick a more recent version of xcode export DEVELOPER_DIR=/Applications/Xcode_10.3.app/Contents/Developer diff --git a/tensorflow/tools/ci_build/release/macos/cpu_py35_full/pip_v1.sh b/tensorflow/tools/ci_build/release/macos/cpu_py35_full/pip_v1.sh index c17b4c788cc..e03f4c4ce2f 100644 --- a/tensorflow/tools/ci_build/release/macos/cpu_py35_full/pip_v1.sh +++ b/tensorflow/tools/ci_build/release/macos/cpu_py35_full/pip_v1.sh @@ -17,12 +17,7 @@ set -e set -x source tensorflow/tools/ci_build/release/common.sh - -# Install latest bazel -update_bazel_macos -which bazel -bazel version -set_bazel_outdir +install_bazelisk # Install pip dependencies sudo pip3.5 install --upgrade pip diff --git a/tensorflow/tools/ci_build/release/macos/cpu_py35_full/release.sh b/tensorflow/tools/ci_build/release/macos/cpu_py35_full/release.sh index 2d3f637aa55..8ee43fb1b2f 100644 --- a/tensorflow/tools/ci_build/release/macos/cpu_py35_full/release.sh +++ b/tensorflow/tools/ci_build/release/macos/cpu_py35_full/release.sh @@ -16,10 +16,7 @@ set -e source tensorflow/tools/ci_build/release/common.sh - -# Install latest bazel -update_bazel_macos -install_macos_pip_deps +install_bazelisk # For python3 path on Mac export PATH=$PATH:/usr/local/bin diff --git a/tensorflow/tools/ci_build/release/macos/cpu_py36_full/nightly_release.sh b/tensorflow/tools/ci_build/release/macos/cpu_py36_full/nightly_release.sh index 66f182e9091..3702ec97413 100644 --- a/tensorflow/tools/ci_build/release/macos/cpu_py36_full/nightly_release.sh +++ b/tensorflow/tools/ci_build/release/macos/cpu_py36_full/nightly_release.sh @@ -17,12 +17,7 @@ set -e set -x source tensorflow/tools/ci_build/release/common.sh - -# Install latest bazel -update_bazel_macos -which bazel - -set_bazel_outdir +install_bazelisk # Pick a version of xcode export DEVELOPER_DIR=/Applications/Xcode_10.3.app/Contents/Developer diff 
--git a/tensorflow/tools/ci_build/release/macos/cpu_py36_full/nonpip.sh b/tensorflow/tools/ci_build/release/macos/cpu_py36_full/nonpip.sh index 93205f8a60d..a80cdd88ddc 100644 --- a/tensorflow/tools/ci_build/release/macos/cpu_py36_full/nonpip.sh +++ b/tensorflow/tools/ci_build/release/macos/cpu_py36_full/nonpip.sh @@ -17,11 +17,7 @@ set -e set -x source tensorflow/tools/ci_build/release/common.sh -# Install latest bazel -update_bazel_macos -which bazel -bazel version -set_bazel_outdir +install_bazelisk # Pick a more recent version of xcode export DEVELOPER_DIR=/Applications/Xcode_10.3.app/Contents/Developer diff --git a/tensorflow/tools/ci_build/release/macos/cpu_py36_full/nonpip_v1.sh b/tensorflow/tools/ci_build/release/macos/cpu_py36_full/nonpip_v1.sh index 33363c56cdd..01e95c37bae 100644 --- a/tensorflow/tools/ci_build/release/macos/cpu_py36_full/nonpip_v1.sh +++ b/tensorflow/tools/ci_build/release/macos/cpu_py36_full/nonpip_v1.sh @@ -17,12 +17,7 @@ set -e set -x source tensorflow/tools/ci_build/release/common.sh - -# Install latest bazel -update_bazel_macos -which bazel -bazel version -set_bazel_outdir +install_bazelisk # Pick a more recent version of xcode sudo xcode-select --switch /Applications/Xcode_9.2.app/Contents/Developer diff --git a/tensorflow/tools/ci_build/release/macos/cpu_py36_full/pip.sh b/tensorflow/tools/ci_build/release/macos/cpu_py36_full/pip.sh index 0ae2c3b4069..a66dca3885e 100644 --- a/tensorflow/tools/ci_build/release/macos/cpu_py36_full/pip.sh +++ b/tensorflow/tools/ci_build/release/macos/cpu_py36_full/pip.sh @@ -17,11 +17,7 @@ set -e set -x source tensorflow/tools/ci_build/release/common.sh -# Install latest bazel -update_bazel_macos -which bazel -bazel version -set_bazel_outdir +install_bazelisk # Pick a more recent version of xcode export DEVELOPER_DIR=/Applications/Xcode_10.3.app/Contents/Developer diff --git a/tensorflow/tools/ci_build/release/macos/cpu_py36_full/pip_v1.sh b/tensorflow/tools/ci_build/release/macos/cpu_py36_full/pip_v1.sh index c96fce078e2..dc153b16a43 100644 --- a/tensorflow/tools/ci_build/release/macos/cpu_py36_full/pip_v1.sh +++ b/tensorflow/tools/ci_build/release/macos/cpu_py36_full/pip_v1.sh @@ -17,12 +17,7 @@ set -e set -x source tensorflow/tools/ci_build/release/common.sh - -# Install latest bazel -update_bazel_macos -which bazel -bazel version -set_bazel_outdir +install_bazelisk # Install pip dependencies install_macos_pip_deps sudo pip3.6 diff --git a/tensorflow/tools/ci_build/release/macos/cpu_py37_full/nightly_release.sh b/tensorflow/tools/ci_build/release/macos/cpu_py37_full/nightly_release.sh index 847a5966145..eee97f6e2d2 100644 --- a/tensorflow/tools/ci_build/release/macos/cpu_py37_full/nightly_release.sh +++ b/tensorflow/tools/ci_build/release/macos/cpu_py37_full/nightly_release.sh @@ -17,12 +17,7 @@ set -e set -x source tensorflow/tools/ci_build/release/common.sh - -# Install latest bazel -update_bazel_macos -which bazel - -set_bazel_outdir +install_bazelisk # Pick a version of xcode export DEVELOPER_DIR=/Applications/Xcode_10.3.app/Contents/Developer diff --git a/tensorflow/tools/ci_build/release/macos/cpu_py37_full/nonpip.sh b/tensorflow/tools/ci_build/release/macos/cpu_py37_full/nonpip.sh index f1afa600f0f..e7234024ca5 100644 --- a/tensorflow/tools/ci_build/release/macos/cpu_py37_full/nonpip.sh +++ b/tensorflow/tools/ci_build/release/macos/cpu_py37_full/nonpip.sh @@ -17,11 +17,7 @@ set -e set -x source tensorflow/tools/ci_build/release/common.sh -# Install latest bazel -update_bazel_macos -which bazel -bazel version 
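All of these macOS scripts replace the `update_bazel_macos` / `which bazel` / `bazel version` / `set_bazel_outdir` sequence with a single `install_bazelisk` call. Based on the common.sh hunk earlier in this diff, that helper pins bazelisk v1.3.0 and exposes it on PATH as `bazel`; roughly as below, noting that the darwin asset name is an assumption here (common.sh selects `$name` per platform):

```bash
# Approximate effect of install_bazelisk on macOS; bazelisk then downloads and
# delegates to whichever Bazel release the workspace pins.
mkdir -p "$HOME/bin"
wget --no-verbose -O "$HOME/bin/bazel" \
  "https://github.com/bazelbuild/bazelisk/releases/download/v1.3.0/bazelisk-darwin-amd64"
chmod u+x "$HOME/bin/bazel"
export PATH="$HOME/bin:$PATH"
bazel version   # first use fetches the pinned Bazel, so no installer script is needed
```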
-set_bazel_outdir +install_bazelisk # Pick a more recent version of xcode export DEVELOPER_DIR=/Applications/Xcode_10.3.app/Contents/Developer diff --git a/tensorflow/tools/ci_build/release/macos/cpu_py37_full/nonpip_v1.sh b/tensorflow/tools/ci_build/release/macos/cpu_py37_full/nonpip_v1.sh index 965fba0e3d2..45d61222726 100644 --- a/tensorflow/tools/ci_build/release/macos/cpu_py37_full/nonpip_v1.sh +++ b/tensorflow/tools/ci_build/release/macos/cpu_py37_full/nonpip_v1.sh @@ -17,12 +17,7 @@ set -e set -x source tensorflow/tools/ci_build/release/common.sh - -# Install latest bazel -update_bazel_macos -which bazel -bazel version -set_bazel_outdir +install_bazelisk # Pick a more recent version of xcode sudo xcode-select --switch /Applications/Xcode_9.2.app/Contents/Developer diff --git a/tensorflow/tools/ci_build/release/macos/cpu_py37_full/pip.sh b/tensorflow/tools/ci_build/release/macos/cpu_py37_full/pip.sh index 2d5fb071913..5d75224a45c 100644 --- a/tensorflow/tools/ci_build/release/macos/cpu_py37_full/pip.sh +++ b/tensorflow/tools/ci_build/release/macos/cpu_py37_full/pip.sh @@ -17,11 +17,7 @@ set -e set -x source tensorflow/tools/ci_build/release/common.sh -# Install latest bazel -update_bazel_macos -which bazel -bazel version -set_bazel_outdir +install_bazelisk # Pick a more recent version of xcode export DEVELOPER_DIR=/Applications/Xcode_10.3.app/Contents/Developer diff --git a/tensorflow/tools/ci_build/release/macos/cpu_py37_full/pip_v1.sh b/tensorflow/tools/ci_build/release/macos/cpu_py37_full/pip_v1.sh index ab9b7c3219b..afe933a1912 100644 --- a/tensorflow/tools/ci_build/release/macos/cpu_py37_full/pip_v1.sh +++ b/tensorflow/tools/ci_build/release/macos/cpu_py37_full/pip_v1.sh @@ -17,12 +17,7 @@ set -e set -x source tensorflow/tools/ci_build/release/common.sh - -# Install latest bazel -update_bazel_macos -which bazel -bazel version -set_bazel_outdir +install_bazelisk # Install pip dependencies install_macos_pip_deps sudo pip3.7 diff --git a/tensorflow/tools/ci_build/release/macos/cpu_py37_full/release.sh b/tensorflow/tools/ci_build/release/macos/cpu_py37_full/release.sh index 895059cec5b..7465838abb9 100644 --- a/tensorflow/tools/ci_build/release/macos/cpu_py37_full/release.sh +++ b/tensorflow/tools/ci_build/release/macos/cpu_py37_full/release.sh @@ -17,12 +17,7 @@ set -e set -x source tensorflow/tools/ci_build/release/common.sh - -# Install latest bazel -update_bazel_macos -which bazel - -set_bazel_outdir +install_bazelisk install_macos_pip_deps sudo pip3.7 diff --git a/tensorflow/tools/ci_build/release/macos/cpu_py38_full/nightly_release.sh b/tensorflow/tools/ci_build/release/macos/cpu_py38_full/nightly_release.sh index de2a0476667..70773c1b597 100644 --- a/tensorflow/tools/ci_build/release/macos/cpu_py38_full/nightly_release.sh +++ b/tensorflow/tools/ci_build/release/macos/cpu_py38_full/nightly_release.sh @@ -17,12 +17,7 @@ set -e set -x source tensorflow/tools/ci_build/release/common.sh - -# Install latest bazel -update_bazel_macos -which bazel - -set_bazel_outdir +install_bazelisk # Pick a version of xcode export DEVELOPER_DIR=/Applications/Xcode_10.3.app/Contents/Developer diff --git a/tensorflow/tools/ci_build/release/macos/cpu_py38_full/nonpip.sh b/tensorflow/tools/ci_build/release/macos/cpu_py38_full/nonpip.sh index f45e1281d49..b9a4157577d 100644 --- a/tensorflow/tools/ci_build/release/macos/cpu_py38_full/nonpip.sh +++ b/tensorflow/tools/ci_build/release/macos/cpu_py38_full/nonpip.sh @@ -17,11 +17,7 @@ set -e set -x source tensorflow/tools/ci_build/release/common.sh -# 
Install latest bazel -update_bazel_macos -which bazel -bazel version -set_bazel_outdir +install_bazelisk # Pick a more recent version of xcode export DEVELOPER_DIR=/Applications/Xcode_10.3.app/Contents/Developer diff --git a/tensorflow/tools/ci_build/release/macos/cpu_py38_full/pip.sh b/tensorflow/tools/ci_build/release/macos/cpu_py38_full/pip.sh index deeb2c81685..a5a5b6a34c4 100644 --- a/tensorflow/tools/ci_build/release/macos/cpu_py38_full/pip.sh +++ b/tensorflow/tools/ci_build/release/macos/cpu_py38_full/pip.sh @@ -17,11 +17,7 @@ set -e set -x source tensorflow/tools/ci_build/release/common.sh -# Install latest bazel -update_bazel_macos -which bazel -bazel version -set_bazel_outdir +install_bazelisk # Pick a more recent version of xcode export DEVELOPER_DIR=/Applications/Xcode_10.3.app/Contents/Developer diff --git a/tensorflow/tools/ci_build/release/ubuntu_16/cpu_py2_full/nightly_release.sh b/tensorflow/tools/ci_build/release/ubuntu_16/cpu_py2_full/nightly_release.sh index 23ab9479d19..1a5124ecef3 100644 --- a/tensorflow/tools/ci_build/release/ubuntu_16/cpu_py2_full/nightly_release.sh +++ b/tensorflow/tools/ci_build/release/ubuntu_16/cpu_py2_full/nightly_release.sh @@ -18,8 +18,6 @@ set -x source tensorflow/tools/ci_build/release/common.sh -set_bazel_outdir - # Install python dependencies install_ubuntu_16_pip_deps pip2.7 diff --git a/tensorflow/tools/ci_build/release/ubuntu_16/cpu_py35_full/nightly_release.sh b/tensorflow/tools/ci_build/release/ubuntu_16/cpu_py35_full/nightly_release.sh index 45adacae2c8..ba1861b221e 100644 --- a/tensorflow/tools/ci_build/release/ubuntu_16/cpu_py35_full/nightly_release.sh +++ b/tensorflow/tools/ci_build/release/ubuntu_16/cpu_py35_full/nightly_release.sh @@ -17,7 +17,6 @@ set -e set -x source tensorflow/tools/ci_build/release/common.sh -set_bazel_outdir install_ubuntu_16_pip_deps pip3.5 @@ -59,7 +58,7 @@ for WHL_PATH in $(ls pip_pkg/tf_nightly_cpu-*dev*.whl); do # Upload the PIP package if whl test passes. if [ ${RETVAL} -eq 0 ]; then echo "Basic PIP test PASSED, Uploading package: ${AUDITED_WHL_NAME}" - twine upload -r pypi-warehouse "${AUDITED_WHL_NAME}" || echo + twine upload -r pypi-warehouse "${AUDITED_WHL_NAME}" else echo "Basic PIP test FAILED, will not upload ${AUDITED_WHL_NAME} package" return 1 diff --git a/tensorflow/tools/ci_build/release/ubuntu_16/cpu_py36_full/nightly_release.sh b/tensorflow/tools/ci_build/release/ubuntu_16/cpu_py36_full/nightly_release.sh index f7310aea5e6..2b770867099 100644 --- a/tensorflow/tools/ci_build/release/ubuntu_16/cpu_py36_full/nightly_release.sh +++ b/tensorflow/tools/ci_build/release/ubuntu_16/cpu_py36_full/nightly_release.sh @@ -17,7 +17,6 @@ set -e set -x source tensorflow/tools/ci_build/release/common.sh -set_bazel_outdir install_ubuntu_16_pip_deps pip3.6 @@ -59,7 +58,7 @@ for WHL_PATH in $(ls pip_pkg/tf_nightly_cpu-*dev*.whl); do # Upload the PIP package if whl test passes. 
if [ ${RETVAL} -eq 0 ]; then echo "Basic PIP test PASSED, Uploading package: ${AUDITED_WHL_NAME}" - twine upload -r pypi-warehouse "${AUDITED_WHL_NAME}" || echo + twine upload -r pypi-warehouse "${AUDITED_WHL_NAME}" else echo "Basic PIP test FAILED, will not upload ${AUDITED_WHL_NAME} package" return 1 diff --git a/tensorflow/tools/ci_build/release/ubuntu_16/cpu_py37_full/nightly_release.sh b/tensorflow/tools/ci_build/release/ubuntu_16/cpu_py37_full/nightly_release.sh index efdbe62ae4f..25e59a5b096 100644 --- a/tensorflow/tools/ci_build/release/ubuntu_16/cpu_py37_full/nightly_release.sh +++ b/tensorflow/tools/ci_build/release/ubuntu_16/cpu_py37_full/nightly_release.sh @@ -17,7 +17,6 @@ set -e set -x source tensorflow/tools/ci_build/release/common.sh -set_bazel_outdir install_ubuntu_16_pip_deps pip3.7 @@ -59,7 +58,7 @@ for WHL_PATH in $(ls pip_pkg/tf_nightly_cpu-*dev*.whl); do # Upload the PIP package if whl test passes. if [ ${RETVAL} -eq 0 ]; then echo "Basic PIP test PASSED, Uploading package: ${AUDITED_WHL_NAME}" - twine upload -r pypi-warehouse "${AUDITED_WHL_NAME}" || echo + twine upload -r pypi-warehouse "${AUDITED_WHL_NAME}" else echo "Basic PIP test FAILED, will not upload ${AUDITED_WHL_NAME} package" return 1 diff --git a/tensorflow/tools/ci_build/release/ubuntu_16/cpu_py38_full/nightly_release.sh b/tensorflow/tools/ci_build/release/ubuntu_16/cpu_py38_full/nightly_release.sh index a9d27d77b2f..e82064f7221 100644 --- a/tensorflow/tools/ci_build/release/ubuntu_16/cpu_py38_full/nightly_release.sh +++ b/tensorflow/tools/ci_build/release/ubuntu_16/cpu_py38_full/nightly_release.sh @@ -17,10 +17,11 @@ set -e set -x source tensorflow/tools/ci_build/release/common.sh -set_bazel_outdir install_ubuntu_16_pip_deps pip3.8 +pip3.7 install --upgrade auditwheel --user + install_bazelisk python2.7 tensorflow/tools/ci_build/update_version.py --nightly @@ -57,7 +58,7 @@ for WHL_PATH in $(ls pip_pkg/tf_nightly_cpu-*dev*.whl); do # Upload the PIP package if whl test passes. if [ ${RETVAL} -eq 0 ]; then echo "Basic PIP test PASSED, Uploading package: ${AUDITED_WHL_NAME}" - twine upload -r pypi-warehouse "${AUDITED_WHL_NAME}" || echo + twine upload -r pypi-warehouse "${AUDITED_WHL_NAME}" else echo "Basic PIP test FAILED, will not upload ${AUDITED_WHL_NAME} package" return 1 diff --git a/tensorflow/tools/ci_build/release/ubuntu_16/gpu_py2_full/nightly_release.sh b/tensorflow/tools/ci_build/release/ubuntu_16/gpu_py2_full/nightly_release.sh index 368c66515cc..3635f4289c8 100644 --- a/tensorflow/tools/ci_build/release/ubuntu_16/gpu_py2_full/nightly_release.sh +++ b/tensorflow/tools/ci_build/release/ubuntu_16/gpu_py2_full/nightly_release.sh @@ -18,8 +18,6 @@ set -x source tensorflow/tools/ci_build/release/common.sh -set_bazel_outdir - install_ubuntu_16_pip_deps pip2.7 install_bazelisk diff --git a/tensorflow/tools/ci_build/release/ubuntu_16/gpu_py35_full/nightly_release.sh b/tensorflow/tools/ci_build/release/ubuntu_16/gpu_py35_full/nightly_release.sh index 05962179a5c..ae5524f01ae 100644 --- a/tensorflow/tools/ci_build/release/ubuntu_16/gpu_py35_full/nightly_release.sh +++ b/tensorflow/tools/ci_build/release/ubuntu_16/gpu_py35_full/nightly_release.sh @@ -18,8 +18,6 @@ set -x source tensorflow/tools/ci_build/release/common.sh -set_bazel_outdir - install_ubuntu_16_pip_deps pip3.5 install_bazelisk @@ -68,7 +66,7 @@ for WHL_PATH in $(ls pip_pkg/tf_nightly*dev*.whl); do # Upload the PIP package if whl test passes. 
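The recurring `twine upload ... || echo` → `twine upload ...` change in these nightly scripts is behavioral, not cosmetic: under `set -e`, the `|| echo` fallback converted a failed upload into a success, so broken uploads went unnoticed. A tiny demonstration, with `false` standing in for a failing twine call:

```bash
set -e
false || echo   # failure masked: echo succeeds, execution continues
echo "still running after the masked failure"
false           # unmasked: the script aborts here with a nonzero status
echo "never printed"
```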
if [ ${RETVAL} -eq 0 ]; then echo "Basic PIP test PASSED, Uploading package: ${AUDITED_WHL_NAME}" - twine upload -r pypi-warehouse "${AUDITED_WHL_NAME}" || echo + twine upload -r pypi-warehouse "${AUDITED_WHL_NAME}" else echo "Basic PIP test FAILED, will not upload ${AUDITED_WHL_NAME} package" return 1 diff --git a/tensorflow/tools/ci_build/release/ubuntu_16/gpu_py36_full/nightly_release.sh b/tensorflow/tools/ci_build/release/ubuntu_16/gpu_py36_full/nightly_release.sh index c3fefae8ff7..776f8d87ef8 100644 --- a/tensorflow/tools/ci_build/release/ubuntu_16/gpu_py36_full/nightly_release.sh +++ b/tensorflow/tools/ci_build/release/ubuntu_16/gpu_py36_full/nightly_release.sh @@ -18,8 +18,6 @@ set -x source tensorflow/tools/ci_build/release/common.sh -set_bazel_outdir - install_ubuntu_16_pip_deps pip3.6 install_bazelisk @@ -68,7 +66,7 @@ for WHL_PATH in $(ls pip_pkg/tf_nightly*dev*.whl); do # Upload the PIP package if whl test passes. if [ ${RETVAL} -eq 0 ]; then echo "Basic PIP test PASSED, Uploading package: ${AUDITED_WHL_NAME}" - twine upload -r pypi-warehouse "${AUDITED_WHL_NAME}" || echo + twine upload -r pypi-warehouse "${AUDITED_WHL_NAME}" else echo "Basic PIP test FAILED, will not upload ${AUDITED_WHL_NAME} package" return 1 diff --git a/tensorflow/tools/ci_build/release/ubuntu_16/gpu_py37_full/nightly_release.sh b/tensorflow/tools/ci_build/release/ubuntu_16/gpu_py37_full/nightly_release.sh index c12fb936ba7..693d6d9e44f 100644 --- a/tensorflow/tools/ci_build/release/ubuntu_16/gpu_py37_full/nightly_release.sh +++ b/tensorflow/tools/ci_build/release/ubuntu_16/gpu_py37_full/nightly_release.sh @@ -18,8 +18,6 @@ set -x source tensorflow/tools/ci_build/release/common.sh -set_bazel_outdir - install_ubuntu_16_pip_deps pip3.7 install_bazelisk @@ -68,7 +66,7 @@ for WHL_PATH in $(ls pip_pkg/tf_nightly*dev*.whl); do # Upload the PIP package if whl test passes. if [ ${RETVAL} -eq 0 ]; then echo "Basic PIP test PASSED, Uploading package: ${AUDITED_WHL_NAME}" - twine upload -r pypi-warehouse "${AUDITED_WHL_NAME}" || echo + twine upload -r pypi-warehouse "${AUDITED_WHL_NAME}" else echo "Basic PIP test FAILED, will not upload ${AUDITED_WHL_NAME} package" return 1 diff --git a/tensorflow/tools/ci_build/release/ubuntu_16/gpu_py38_full/nightly_release.sh b/tensorflow/tools/ci_build/release/ubuntu_16/gpu_py38_full/nightly_release.sh new file mode 100644 index 00000000000..7a1f24a29ec --- /dev/null +++ b/tensorflow/tools/ci_build/release/ubuntu_16/gpu_py38_full/nightly_release.sh @@ -0,0 +1,76 @@ +#!/bin/bash +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +set -e +set -x + +source tensorflow/tools/ci_build/release/common.sh + +install_ubuntu_16_pip_deps pip3.8 + +pip3.7 install --upgrade auditwheel --user + +update_bazel_linux + +python2.7 tensorflow/tools/ci_build/update_version.py --nightly + +# Run configure. 
+export TF_NEED_GCP=1 +export TF_NEED_HDFS=1 +export TF_NEED_S3=1 +export TF_NEED_CUDA=1 +export TF_CUDA_VERSION=10 +export TF_CUDNN_VERSION=7 +export TF_CUDA_COMPUTE_CAPABILITIES=3.5,3.7,5.2,6.0,6.1,7.0 +export TF_NEED_TENSORRT=1 +export TENSORRT_INSTALL_PATH=/usr/local/tensorrt +export CC_OPT_FLAGS='-mavx' +export PYTHON_BIN_PATH=$(which python3.8) +yes "" | "$PYTHON_BIN_PATH" configure.py + +# Build the pip package +bazel build --config=opt --config=v2 \ + --crosstool_top=//third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.1:toolchain \ + tensorflow/tools/pip_package:build_pip_package + +./bazel-bin/tensorflow/tools/pip_package/build_pip_package pip_pkg --nightly_flag +./bazel-bin/tensorflow/tools/pip_package/build_pip_package pip_pkg --gpu --nightly_flag + +# Upload the built packages to pypi. +for WHL_PATH in $(ls pip_pkg/tf_nightly*dev*.whl); do + + WHL_DIR=$(dirname "${WHL_PATH}") + WHL_BASE_NAME=$(basename "${WHL_PATH}") + AUDITED_WHL_NAME="${WHL_DIR}"/$(echo "${WHL_BASE_NAME//linux/manylinux2010}") + + # Copy and rename for gpu manylinux as we do not want auditwheel to package in libcudart.so + WHL_PATH=${AUDITED_WHL_NAME} + cp "${WHL_DIR}"/"${WHL_BASE_NAME}" "${WHL_PATH}" + echo "Copied manylinux2010 wheel file at: ${WHL_PATH}" + + # test the whl pip package + chmod +x tensorflow/tools/ci_build/builds/nightly_release_smoke_test.sh + ./tensorflow/tools/ci_build/builds/nightly_release_smoke_test.sh ${AUDITED_WHL_NAME} + RETVAL=$? + + # Upload the PIP package if whl test passes. + if [ ${RETVAL} -eq 0 ]; then + echo "Basic PIP test PASSED, Uploading package: ${AUDITED_WHL_NAME}" + twine upload -r pypi-warehouse "${AUDITED_WHL_NAME}" + else + echo "Basic PIP test FAILED, will not upload ${AUDITED_WHL_NAME} package" + return 1 + fi +done diff --git a/tensorflow/tools/ci_build/release/ubuntu_16/gpu_py38_full/nonpip.sh b/tensorflow/tools/ci_build/release/ubuntu_16/gpu_py38_full/nonpip.sh new file mode 100644 index 00000000000..639ba9edb5a --- /dev/null +++ b/tensorflow/tools/ci_build/release/ubuntu_16/gpu_py38_full/nonpip.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +set -e +set -x + +source tensorflow/tools/ci_build/release/common.sh + +install_ubuntu_16_pip_deps pip3.8 +# Update bazel +update_bazel_linux + +# Run configure. 
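The upload loop in the new gpu_py38 nightly script renames wheels from `linux` to `manylinux2010` platform tags with bash pattern substitution; the wrapping `$(echo ...)` is redundant, since the expansion alone produces the value. An illustrative example (the wheel filename is made up):

```bash
# ${var//pattern/replacement} rewrites every occurrence of the pattern.
WHL_BASE_NAME="tf_nightly-2.2.0.dev20200128-cp38-cp38-linux_x86_64.whl"
echo "${WHL_BASE_NAME//linux/manylinux2010}"
# -> tf_nightly-2.2.0.dev20200128-cp38-cp38-manylinux2010_x86_64.whl
```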
+export TF_NEED_GCP=1 +export TF_NEED_HDFS=1 +export TF_NEED_S3=1 +export TF_NEED_CUDA=1 +export TF_CUDA_VERSION=10 +export TF_CUDNN_VERSION=7 +export TF_NEED_TENSORRT=1 +export TENSORRT_INSTALL_PATH=/usr/local/tensorrt +export CC_OPT_FLAGS='-mavx' +export PYTHON_BIN_PATH=$(which python3.8) +export TF2_BEHAVIOR=1 +export PROJECT_NAME="tensorflow_gpu" +export LD_LIBRARY_PATH="/usr/local/cuda:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:$TENSORRT_INSTALL_PATH/lib" +export TF_CUDA_COMPUTE_CAPABILITIES=3.5,3.7,5.2,6.0,6.1,7.0 + +yes "" | "$PYTHON_BIN_PATH" configure.py + +# Get the default test targets for bazel. +source tensorflow/tools/ci_build/build_scripts/PRESUBMIT_BUILD_TARGETS.sh + +tag_filters="gpu,requires-gpu,-no_gpu,-no_oss,-oss_serial,-no_oss_py38" + +bazel test --config=cuda --config=opt \ + --crosstool_top=//third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.1:toolchain \ + --linkopt=-lrt \ + --action_env=TF2_BEHAVIOR="${TF2_BEHAVIOR}" \ + --test_lang_filters=py \ + --build_tag_filters=${tag_filters} \ + --test_tag_filters=${tag_filters} \ + --test_timeout="300,450,1200,3600" --local_test_jobs=4 \ + --test_output=errors --verbose_failures=true --keep_going \ + --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute \ + -- ${DEFAULT_BAZEL_TARGETS} -//tensorflow/lite/... diff --git a/tensorflow/tools/ci_build/release/ubuntu_16/gpu_py38_full/pip.sh b/tensorflow/tools/ci_build/release/ubuntu_16/gpu_py38_full/pip.sh new file mode 100644 index 00000000000..28b633c390e --- /dev/null +++ b/tensorflow/tools/ci_build/release/ubuntu_16/gpu_py38_full/pip.sh @@ -0,0 +1,69 @@ +#!/bin/bash +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +set -e +set -x + +source tensorflow/tools/ci_build/release/common.sh + +install_ubuntu_16_pip_deps pip3.8 +# Update bazel +update_bazel_linux + +# Export required variables for running pip.sh +export OS_TYPE="UBUNTU" +export CONTAINER_TYPE="GPU" +export TF_PYTHON_VERSION='python3.8' + +# Run configure. +export TF_NEED_GCP=1 +export TF_NEED_HDFS=1 +export TF_NEED_S3=1 +export TF_NEED_CUDA=1 +export TF_CUDA_VERSION=10 +export TF_CUDNN_VERSION=7 +export TF_NEED_TENSORRT=1 +export TENSORRT_INSTALL_PATH=/usr/local/tensorrt +export CC_OPT_FLAGS='-mavx' +export PYTHON_BIN_PATH=$(which ${TF_PYTHON_VERSION}) +export PROJECT_NAME="tensorflow_gpu" +export LD_LIBRARY_PATH="/usr/local/cuda:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:$TENSORRT_INSTALL_PATH/lib" +export TF_CUDA_COMPUTE_CAPABILITIES=3.5,3.7,5.2,6.0,6.1,7.0 + +yes "" | "$PYTHON_BIN_PATH" configure.py + +# Get the default test targets for bazel. 
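As in the other release scripts, configure runs non-interactively: every answer is pre-seeded through `TF_*` environment variables, and `yes ""` feeds an empty line to any prompt that remains, which accepts its default. A trimmed sketch under the assumption that python3.8 is on PATH (the full scripts above export many more variables):

```bash
export TF_NEED_CUDA=1
export TF_CUDA_VERSION=10
export TF_CUDNN_VERSION=7
export PYTHON_BIN_PATH="$(which python3.8)"
yes "" | "$PYTHON_BIN_PATH" configure.py   # empty answers take defaults for unset options
```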
+source tensorflow/tools/ci_build/build_scripts/PRESUBMIT_BUILD_TARGETS.sh + +# Export optional variables for running pip.sh +export TF_TEST_FILTER_TAGS='gpu,requires-gpu,-no_gpu,-no_oss,-oss_serial,-no_oss_py38' +export TF_BUILD_FLAGS="--config=opt --config=v2 --config=cuda --distinct_host_configuration=false \ +--action_env=TF_CUDA_VERSION --action_env=TF_CUDNN_VERSION --crosstool_top=//third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.1:toolchain " +export TF_TEST_FLAGS="--test_tag_filters=${TF_TEST_FILTER_TAGS} --build_tag_filters=${TF_TEST_FILTER_TAGS} \ +--distinct_host_configuration=false \ +--action_env=TF_CUDA_VERSION --action_env=TF_CUDNN_VERSION --test_env=TF2_BEHAVIOR=1 \ +--config=cuda --test_output=errors --local_test_jobs=4 --test_lang_filters=py \ +--verbose_failures=true --keep_going --define=no_tensorflow_py_deps=true \ +--run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute " +export TF_TEST_TARGETS="${DEFAULT_BAZEL_TARGETS} -//tensorflow/lite/... " +export TF_PIP_TESTS="test_pip_virtualenv_non_clean test_pip_virtualenv_clean" +export IS_NIGHTLY=0 # Not nightly +export TF_PROJECT_NAME=${PROJECT_NAME} +export TF_PIP_TEST_ROOT="pip_test" + +# To build both tensorflow and tensorflow-gpu pip packages +export TF_BUILD_BOTH_GPU_PACKAGES=1 + +./tensorflow/tools/ci_build/builds/pip_new.sh diff --git a/tensorflow/tools/ci_build/release/ubuntu_16/tpu_py37_full/nonpip.sh b/tensorflow/tools/ci_build/release/ubuntu_16/tpu_py37_full/nonpip.sh index 9d5488a7236..4a7a628b5bc 100644 --- a/tensorflow/tools/ci_build/release/ubuntu_16/tpu_py37_full/nonpip.sh +++ b/tensorflow/tools/ci_build/release/ubuntu_16/tpu_py37_full/nonpip.sh @@ -35,6 +35,7 @@ export TF2_BEHAVIOR=1 yes "" | "$PYTHON_BIN_PATH" configure.py +test_patterns=(//tensorflow/... -//tensorflow/compiler/... -//tensorflow/lite/...) tag_filters="tpu,-no_tpu,-notpu,-no_oss,-no_oss_py37" bazel_args=( @@ -45,14 +46,18 @@ bazel_args=( --noincompatible_strict_action_env \ --build_tag_filters="${tag_filters}" \ --test_tag_filters="${tag_filters}" \ - --test_output=errors --verbose_failures=true --keep_going \ + --test_output=errors --verbose_failures=true --keep_going +) + +bazel build "${bazel_args[@]}" -- "${test_patterns[@]}" + +ctpu_up -s v2-8 -p tensorflow-testing-tpu + +test_args=( --test_arg=--tpu="${TPU_NAME}" \ --test_arg=--zone="${TPU_ZONE}" \ --test_arg=--test_dir_base=gs://kokoro-tpu-testing/tempdir/ \ - --local_test_jobs=1 \ - -- //tensorflow/... -//tensorflow/compiler/... -//tensorflow/lite/... + --local_test_jobs=1 ) -bazel build "${bazel_args[@]}" -ctpu_up -s v2-8 -p tensorflow-testing-tpu -bazel test "${bazel_args[@]}" +bazel test "${bazel_args[@]}" "${test_args[@]}" -- "${test_patterns[@]}" diff --git a/tensorflow/tools/ci_build/release/windows/cpu_py38_full/nightly.bat b/tensorflow/tools/ci_build/release/windows/cpu_py38_full/nightly.bat new file mode 100644 index 00000000000..65692431469 --- /dev/null +++ b/tensorflow/tools/ci_build/release/windows/cpu_py38_full/nightly.bat @@ -0,0 +1,20 @@ +:: Copyright 2019 The TensorFlow Authors. All Rights Reserved. +:: +:: Licensed under the Apache License, Version 2.0 (the "License"); +:: you may not use this file except in compliance with the License. 
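The tpu_py37_full/nonpip.sh rework above splits one bazel invocation into build → ctpu_up → test, so a broken build no longer consumes TPU time, and it keeps the shared flags and target patterns in arrays. The shape of that flow, with an illustrative subset of the flags and `TPU_NAME` assumed to come from the environment:

```bash
bazel_args=( --config=opt --test_output=errors )   # shared flags (illustrative subset)
test_patterns=( //tensorflow/... -//tensorflow/compiler/... -//tensorflow/lite/... )

bazel build "${bazel_args[@]}" -- "${test_patterns[@]}"   # fail fast before provisioning
ctpu_up -s v2-8 -p tensorflow-testing-tpu                 # TPU comes up only after a good build
bazel test "${bazel_args[@]}" --test_arg=--tpu="${TPU_NAME}" -- "${test_patterns[@]}"
```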
+:: You may obtain a copy of the License at +:: +:: http://www.apache.org/licenses/LICENSE-2.0 +:: +:: Unless required by applicable law or agreed to in writing, software +:: distributed under the License is distributed on an "AS IS" BASIS, +:: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +:: See the License for the specific language governing permissions and +:: limitations under the License. +:: ============================================================================= + +SET PYTHON_DIRECTORY=Python38 + +CALL tensorflow\tools\ci_build\release\common_win.bat + +call tensorflow\tools\ci_build\windows\cpu\pip\run.bat --extra_build_flags "--config=v2" --extra_test_flags "--test_env=TF2_BEHAVIOR=1" diff --git a/tensorflow/tools/ci_build/release/windows/cpu_py38_full/nightly_release.bat b/tensorflow/tools/ci_build/release/windows/cpu_py38_full/nightly_release.bat new file mode 100644 index 00000000000..1214812cc46 --- /dev/null +++ b/tensorflow/tools/ci_build/release/windows/cpu_py38_full/nightly_release.bat @@ -0,0 +1,20 @@ +:: Copyright 2019 The TensorFlow Authors. All Rights Reserved. +:: +:: Licensed under the Apache License, Version 2.0 (the "License"); +:: you may not use this file except in compliance with the License. +:: You may obtain a copy of the License at +:: +:: http://www.apache.org/licenses/LICENSE-2.0 +:: +:: Unless required by applicable law or agreed to in writing, software +:: distributed under the License is distributed on an "AS IS" BASIS, +:: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +:: See the License for the specific language governing permissions and +:: limitations under the License. +:: ============================================================================= + +SET PYTHON_DIRECTORY=Python38 + +CALL tensorflow\tools\ci_build\release\common_win.bat + +call tensorflow\tools\ci_build\windows\cpu\pip\run.bat --tf_nightly --project_name "tf_nightly_cpu" --extra_build_flags "--config=v2" diff --git a/tensorflow/tools/ci_build/release/windows/cpu_py38_full/release.bat b/tensorflow/tools/ci_build/release/windows/cpu_py38_full/release.bat new file mode 100644 index 00000000000..7a7435b3713 --- /dev/null +++ b/tensorflow/tools/ci_build/release/windows/cpu_py38_full/release.bat @@ -0,0 +1,20 @@ +:: Copyright 2019 The TensorFlow Authors. All Rights Reserved. +:: +:: Licensed under the Apache License, Version 2.0 (the "License"); +:: you may not use this file except in compliance with the License. +:: You may obtain a copy of the License at +:: +:: http://www.apache.org/licenses/LICENSE-2.0 +:: +:: Unless required by applicable law or agreed to in writing, software +:: distributed under the License is distributed on an "AS IS" BASIS, +:: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +:: See the License for the specific language governing permissions and +:: limitations under the License. 
+:: ============================================================================= + +SET PYTHON_DIRECTORY=Python38 + +CALL tensorflow\tools\ci_build\release\common_win.bat + +call tensorflow\tools\ci_build\windows\cpu\pip\run.bat --release_build --extra_build_flags "--config=v2" --extra_test_flags "--test_env=TF2_BEHAVIOR=1" --project_name "tensorflow_cpu" diff --git a/third_party/toolchains/preconfig/centos6/gcc7/dummy_toolchain.bzl b/tensorflow/tools/ci_build/release/windows/cpu_py38_full/release_pip_rename.sh old mode 100755 new mode 100644 similarity index 53% rename from third_party/toolchains/preconfig/centos6/gcc7/dummy_toolchain.bzl rename to tensorflow/tools/ci_build/release/windows/cpu_py38_full/release_pip_rename.sh index 45c0285d232..43982623109 --- a/third_party/toolchains/preconfig/centos6/gcc7/dummy_toolchain.bzl +++ b/tensorflow/tools/ci_build/release/windows/cpu_py38_full/release_pip_rename.sh @@ -1,23 +1,25 @@ -# pylint: disable=g-bad-file-header -# Copyright 2017 The Bazel Authors. All rights reserved. +#!/bin/bash +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# ============================================================================== +set -e +set -x -"""Skylark rule that stubs a toolchain.""" +source tensorflow/tools/ci_build/release/common.sh -def _dummy_toolchain_impl(ctx): - ctx = ctx # unused argument - toolchain = platform_common.ToolchainInfo() - return [toolchain] - -dummy_toolchain = rule(_dummy_toolchain_impl, attrs = {}) +# Rename to tensorflow_cpu +for f in $(ls py_test_dir/tensorflow-*cp3*-cp3*m-win_amd64.whl); do + copy_to_new_project_name "${f}" tensorflow_cpu + rm "${f}" +done diff --git a/tensorflow/tools/ci_build/release/windows/gpu_py38_full/nightly.bat b/tensorflow/tools/ci_build/release/windows/gpu_py38_full/nightly.bat new file mode 100644 index 00000000000..247487fa926 --- /dev/null +++ b/tensorflow/tools/ci_build/release/windows/gpu_py38_full/nightly.bat @@ -0,0 +1,20 @@ +:: Copyright 2019 The TensorFlow Authors. All Rights Reserved. +:: +:: Licensed under the Apache License, Version 2.0 (the "License"); +:: you may not use this file except in compliance with the License. +:: You may obtain a copy of the License at +:: +:: http://www.apache.org/licenses/LICENSE-2.0 +:: +:: Unless required by applicable law or agreed to in writing, software +:: distributed under the License is distributed on an "AS IS" BASIS, +:: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +:: See the License for the specific language governing permissions and +:: limitations under the License. 
+:: ============================================================================= + +SET PYTHON_DIRECTORY=Python38 + +CALL tensorflow\tools\ci_build\release\common_win.bat + +call tensorflow\tools\ci_build\windows\gpu\pip\run.bat --extra_build_flags "--config=v2" --extra_test_flags "--test_env=TF2_BEHAVIOR=1" diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.bat b/tensorflow/tools/ci_build/release/windows/gpu_py38_full/nightly_release.bat old mode 100755 new mode 100644 similarity index 70% rename from third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.bat rename to tensorflow/tools/ci_build/release/windows/gpu_py38_full/nightly_release.bat index e896e654fd7..0a208440148 --- a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.bat +++ b/tensorflow/tools/ci_build/release/windows/gpu_py38_full/nightly_release.bat @@ -1,4 +1,4 @@ -:: Copyright 2015 The TensorFlow Authors. All Rights Reserved. +:: Copyright 2019 The TensorFlow Authors. All Rights Reserved. :: :: Licensed under the Apache License, Version 2.0 (the "License"); :: you may not use this file except in compliance with the License. @@ -13,8 +13,8 @@ :: limitations under the License. :: ============================================================================= -:: Invoke msvc_wrapper_for_nvcc.py, which is located in the same directory. -@echo OFF -set arg0=%~0 -for %%F in ("%arg0%") do set DRIVER_BIN=%%~dpF -"/usr/bin/python3" -B "%DRIVER_BIN%\msvc_wrapper_for_nvcc.py" %* +SET PYTHON_DIRECTORY=Python38 + +CALL tensorflow\tools\ci_build\release\common_win.bat + +call tensorflow\tools\ci_build\windows\gpu\pip\run.bat --tf_nightly --extra_build_flags "--config=v2" diff --git a/tensorflow/tools/ci_build/release/windows/gpu_py38_full/release.bat b/tensorflow/tools/ci_build/release/windows/gpu_py38_full/release.bat new file mode 100644 index 00000000000..fc1c600fa5e --- /dev/null +++ b/tensorflow/tools/ci_build/release/windows/gpu_py38_full/release.bat @@ -0,0 +1,23 @@ +:: Copyright 2019 The TensorFlow Authors. All Rights Reserved. +:: +:: Licensed under the Apache License, Version 2.0 (the "License"); +:: you may not use this file except in compliance with the License. +:: You may obtain a copy of the License at +:: +:: http://www.apache.org/licenses/LICENSE-2.0 +:: +:: Unless required by applicable law or agreed to in writing, software +:: distributed under the License is distributed on an "AS IS" BASIS, +:: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +:: See the License for the specific language governing permissions and +:: limitations under the License. 
+:: ============================================================================= + +SET PYTHON_DIRECTORY=Python38 + +CALL tensorflow\tools\ci_build\release\common_win.bat + +call tensorflow\tools\ci_build\windows\gpu\pip\run.bat --release_build --extra_build_flags "--config=v2" --extra_test_flags "--test_env=TF2_BEHAVIOR=1" --project_name "tensorflow" + +for %%a in ("%~dp0\.") do set "PARENT_DIR=%%~nxa" +bash -l tensorflow\tools\ci_build\release\windows\%PARENT_DIR%\release_pip_rename.sh diff --git a/third_party/toolchains/preconfig/centos6/gcc7/cc_wrapper.sh b/tensorflow/tools/ci_build/release/windows/gpu_py38_full/release_pip_rename.sh old mode 100755 new mode 100644 similarity index 55% rename from third_party/toolchains/preconfig/centos6/gcc7/cc_wrapper.sh rename to tensorflow/tools/ci_build/release/windows/gpu_py38_full/release_pip_rename.sh index 5a5465cc968..039f9516d86 --- a/third_party/toolchains/preconfig/centos6/gcc7/cc_wrapper.sh +++ b/tensorflow/tools/ci_build/release/windows/gpu_py38_full/release_pip_rename.sh @@ -1,25 +1,24 @@ #!/bin/bash -# -# Copyright 2015 The Bazel Authors. All rights reserved. +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# -# Ship the environment to the C++ action -# -set -eu +# ============================================================================== +set -e +set -x -# Set-up the environment +source tensorflow/tools/ci_build/release/common.sh - -# Call the C++ compiler -/opt/rh/devtoolset-7/root/usr/bin/gcc "$@" +# Copy and rename to tensorflow +for f in $(ls py_test_dir/tensorflow-*cp3*-cp3*m-win_amd64.whl); do + copy_to_new_project_name "${f}" tensorflow_gpu +done diff --git a/tensorflow/tools/def_file_filter/symbols_pybind.txt b/tensorflow/tools/def_file_filter/symbols_pybind.txt index 7bf9f560e00..1298479009b 100644 --- a/tensorflow/tools/def_file_filter/symbols_pybind.txt +++ b/tensorflow/tools/def_file_filter/symbols_pybind.txt @@ -340,3 +340,6 @@ tensorflow::grappler::AnalyticalCostEstimator::PredictCosts [cost_analyzer_lib] # cost_analyzer tensorflow::grappler::CostAnalyzer::CostAnalyzer tensorflow::grappler::CostAnalyzer::GenerateReport + +[flags] # tfe +tensorflow::IsXlaEnabled diff --git a/tensorflow/tools/docs/generate2.py b/tensorflow/tools/docs/generate2.py index cb1bfe39c6c..dcc5444d53c 100644 --- a/tensorflow/tools/docs/generate2.py +++ b/tensorflow/tools/docs/generate2.py @@ -73,7 +73,8 @@ flags.DEFINE_bool("search_hints", True, "Include meta-data search hints at the top of each file.") flags.DEFINE_string( - "site_path", "", "The prefix ({site-path}/api_docs/python/...) 
used in the " + "site_path", "", + "The path prefix (up to `.../api_docs/python`) used in the " "`_toc.yaml` and `_redirects.yaml` files") _PRIVATE_MAP = { @@ -115,9 +116,11 @@ def generate_raw_ops_doc(): has_gradient = "\N{HEAVY CHECK MARK}\N{VARIATION SELECTOR-16}" except LookupError: has_gradient = "\N{CROSS MARK}" - link = ( - '' - '{op_name}').format(op_name=op_name, FLAGS=FLAGS) + path = pathlib.Path("/") / FLAGS.site_path / "tf/raw_ops" / op_name + path = path.with_suffix(".md") + link = ('' + "{op_name}").format( + op_name=op_name, path=str(path)) parts.append( "| {link} | {has_gradient} |".format(link=link, has_gradient=has_gradient)) diff --git a/tensorflow/tools/git/BUILD b/tensorflow/tools/git/BUILD index 8a47f4c4c2d..c1f0577f33b 100644 --- a/tensorflow/tools/git/BUILD +++ b/tensorflow/tools/git/BUILD @@ -13,5 +13,4 @@ py_binary( srcs = ["gen_git_source.py"], python_version = "PY3", srcs_version = "PY2AND3", - deps = ["@six_archive//:six"], ) diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py index 011406e2288..0cb1d006142 100755 --- a/tensorflow/tools/git/gen_git_source.py +++ b/tensorflow/tools/git/gen_git_source.py @@ -35,8 +35,6 @@ import os import shutil import subprocess -import six - def parse_branch_ref(filename): """Given a filename of a .git/HEAD file return ref path. @@ -169,8 +167,8 @@ def get_git_version(git_base_path, git_tag_override): subprocess.check_output([ "git", str("--git-dir=%s/.git" % git_base_path), - str("--work-tree=" + six.ensure_str(git_base_path)), "describe", - "--long", "--tags" + str("--work-tree=%s" % git_base_path), "describe", "--long", + "--tags" ]).strip()) version_separator = b"-" if git_tag_override and val: diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD index 0a9e5d151e0..1479c0177e7 100644 --- a/tensorflow/tools/lib_package/BUILD +++ b/tensorflow/tools/lib_package/BUILD @@ -161,7 +161,6 @@ genrule( "@nasm//:LICENSE", "@nsync//:LICENSE", "@png//:LICENSE", - "@six_archive//:LICENSE", "@snappy//:COPYING", "@sobol_data//:LICENSE", "@zlib//:zlib.h", @@ -244,7 +243,6 @@ genrule( "@nasm//:LICENSE", "@nsync//:LICENSE", "@png//:LICENSE", - "@six_archive//:LICENSE", "@snappy//:COPYING", "@sobol_data//:LICENSE", "@zlib//:zlib.h", diff --git a/tensorflow/tools/pip_package/MANIFEST.in b/tensorflow/tools/pip_package/MANIFEST.in index 41652b1311a..f0d180209f3 100644 --- a/tensorflow/tools/pip_package/MANIFEST.in +++ b/tensorflow/tools/pip_package/MANIFEST.in @@ -2,6 +2,7 @@ include LICENSE include README recursive-include * *.py recursive-include * *.pyd +recursive-include * *.pyi recursive-include * *.pd recursive-include * *.so recursive-include * *.so.[0-9] diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh index d72067ed82a..8cde1ce9499 100755 --- a/tensorflow/tools/pip_package/build_pip_package.sh +++ b/tensorflow/tools/pip_package/build_pip_package.sh @@ -165,6 +165,18 @@ function prepare_src() { rm -f ${TMPDIR}/tensorflow/libtensorflow_framework.so rm -f ${TMPDIR}/tensorflow/libtensorflow_framework.so.[0-9].* + + # Create a keras/__init__.pyi file so that autocomplete for imports + # such as `from tensorflow.keras import losses` works. + # TODO(annarev): copy over API files from tensorflow/api/_vN to tensorflow/ + # except tensorflow/api/_vN/lite/. 
+  mkdir ${TMPDIR}/tensorflow/keras/
+  if [ -d "${TMPDIR}/tensorflow/_api/v1/" ]
+  then
+    echo "from tensorflow.python.keras.api._v1.keras import *" > ${TMPDIR}/tensorflow/keras/__init__.pyi
+  else
+    echo "from tensorflow.python.keras.api._v2.keras import *" > ${TMPDIR}/tensorflow/keras/__init__.pyi
+  fi
 }
 
 function build_wheel() {
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 95a9afa9d5a..319f6f9f260 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -164,11 +164,11 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "mkl_dnn",
         build_file = clean_dep("//third_party/mkl_dnn:mkldnn.BUILD"),
-        sha256 = "ed56652dd237deb86ee9bf102c18de5f2625c059e5ab1d7512c8dc01e316b694",
-        strip_prefix = "mkl-dnn-0.21.2",
+        sha256 = "31e78581e59d7e60d4becaba3834fc6a5bf2dccdae3e16b7f70d89ceab38423f",
+        strip_prefix = "mkl-dnn-0.21.3",
         urls = [
-            "https://storage.googleapis.com/mirror.tensorflow.org/github.com/intel/mkl-dnn/archive/v0.21.2.tar.gz",
-            "https://github.com/intel/mkl-dnn/archive/v0.21.2.tar.gz",
+            "https://storage.googleapis.com/mirror.tensorflow.org/github.com/intel/mkl-dnn/archive/v0.21.3.tar.gz",
+            "https://github.com/intel/mkl-dnn/archive/v0.21.3.tar.gz",
         ],
     )
 
@@ -201,11 +201,11 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""):
         name = "eigen_archive",
         build_file = clean_dep("//third_party:eigen.BUILD"),
         patch_file = clean_dep("//third_party/eigen3:gpu_packet_math.patch"),
-        sha256 = "71905cca5553804beee85e9ab8b254931d3cbeda8df1a40e5af3773f5b657179",  # SHARED_EIGEN_SHA
-        strip_prefix = "eigen-3fda850c46e5e589668a85d89299433e0686eec9",
+        sha256 = "88e95180a7eae9acd3e79d2efeea1026eefad9f515a44418b63b189a1887108c",  # SHARED_EIGEN_SHA
+        strip_prefix = "eigen-52a2fbbb008a47c5e3fb8ac1c65c2feecb0c511c",
         urls = [
-            "https://storage.googleapis.com/mirror.tensorflow.org/gitlab.com/libeigen/eigen/-/archive/3fda850c46e5e589668a85d89299433e0686eec9/eigen-3fda850c46e5e589668a85d89299433e0686eec9.tar.gz",
-            "https://gitlab.com/libeigen/eigen/-/archive/3fda850c46e5e589668a85d89299433e0686eec9/eigen-3fda850c46e5e589668a85d89299433e0686eec9.tar.gz",
+            "https://storage.googleapis.com/mirror.tensorflow.org/gitlab.com/libeigen/eigen/-/archive/52a2fbbb008a47c5e3fb8ac1c65c2feecb0c511c/eigen-52a2fbbb008a47c5e3fb8ac1c65c2feecb0c511c.tar.gz",
+            "https://gitlab.com/libeigen/eigen/-/archive/52a2fbbb008a47c5e3fb8ac1c65c2feecb0c511c/eigen-52a2fbbb008a47c5e3fb8ac1c65c2feecb0c511c.tar.gz",
         ],
     )
 
@@ -305,12 +305,12 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "org_sqlite",
         build_file = clean_dep("//third_party:sqlite.BUILD"),
-        sha256 = "adf051d4c10781ea5cfabbbc4a2577b6ceca68590d23b58b8260a8e24cc5f081",
-        strip_prefix = "sqlite-amalgamation-3300100",
+        sha256 = "f3c79bc9f4162d0b06fa9fe09ee6ccd23bb99ce310b792c5145f87fbcc30efca",
+        strip_prefix = "sqlite-amalgamation-3310100",
         system_build_file = clean_dep("//third_party/systemlibs:sqlite.BUILD"),
         urls = [
-            "https://storage.googleapis.com/mirror.tensorflow.org/www.sqlite.org/2019/sqlite-amalgamation-3300100.zip",
-            "https://www.sqlite.org/2019/sqlite-amalgamation-3300100.zip",
+            "https://storage.googleapis.com/mirror.tensorflow.org/www.sqlite.org/2020/sqlite-amalgamation-3310100.zip",
+            "https://www.sqlite.org/2020/sqlite-amalgamation-3310100.zip",
         ],
     )
 
@@ -424,8 +424,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "absl_py",
-        sha256 = "280c76ec0c9ab7a1dff550cdc37b7c7cd28551103dc3955202760ea8e381aa9d",
-        strip_prefix = "abseil-py-pypi-v0.8.0",
+        sha256 = "603febc9b95a8f2979a7bdb77d2f5e4d9b30d4e0d59579f88eba67d4e4cc5462",
+        strip_prefix = "abseil-py-pypi-v0.9.0",
         system_build_file = clean_dep("//third_party/systemlibs:absl_py.BUILD"),
         system_link_files = {
             "//third_party/systemlibs:absl_py.absl.BUILD": "absl/BUILD",
@@ -433,8 +433,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""):
             "//third_party/systemlibs:absl_py.absl.testing.BUILD": "absl/testing/BUILD",
         },
         urls = [
-            "https://storage.googleapis.com/mirror.tensorflow.org/github.com/abseil/abseil-py/archive/pypi-v0.8.0.tar.gz",
-            "https://github.com/abseil/abseil-py/archive/pypi-v0.8.0.tar.gz",
+            "https://storage.googleapis.com/mirror.tensorflow.org/github.com/abseil/abseil-py/archive/pypi-v0.9.0.tar.gz",
+            "https://github.com/abseil/abseil-py/archive/pypi-v0.9.0.tar.gz",
        ],
     )
 
@@ -597,8 +597,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""):
     )
 
     # Check out LLVM and MLIR from llvm-project.
-    LLVM_COMMIT = "da147ef0a5c6d31c21d31a52b97235a629830c15"
-    LLVM_SHA256 = "b5f85e5338f3ef7fd5f16f1307471f8545705985bd2e5423f67b58f58aedf24b"
+    LLVM_COMMIT = "fee41517fe0f7ff9f0e204dd9200ebf32ca03cb8"
+    LLVM_SHA256 = "dceb84396e8c30348dbd426c53eeae6657f5c67a24830c9a610a037fffcbe5cf"
     LLVM_URLS = [
         "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT),
         "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT),
diff --git a/third_party/aws/BUILD.bazel b/third_party/aws/BUILD.bazel
index 941f03792e6..fd355eeceb1 100644
--- a/third_party/aws/BUILD.bazel
+++ b/third_party/aws/BUILD.bazel
@@ -56,6 +56,8 @@ cc_library(
         "aws-cpp-sdk-core/source/utils/crypto/factory/**/*.cpp",
         "aws-cpp-sdk-s3/include/**/*.h",
         "aws-cpp-sdk-s3/source/**/*.cpp",
+        "aws-cpp-sdk-transfer/include/**/*.h",
+        "aws-cpp-sdk-transfer/source/**/*.cpp",
         "aws-cpp-sdk-core/source/monitoring/*.cpp",
         "aws-cpp-sdk-core/source/net/linux-shared/*.cpp",
         "aws-cpp-sdk-core/source/utils/memory/*.cpp",
@@ -97,6 +99,7 @@ cc_library(
     includes = [
         "aws-cpp-sdk-core/include/",
         "aws-cpp-sdk-s3/include/",
+        "aws-cpp-sdk-transfer/include/",
     ],
     deps = [
         "@aws-c-common",
diff --git a/third_party/llvm/llvm.autogenerated.BUILD b/third_party/llvm/llvm.autogenerated.BUILD
index c80a2d2fce2..a89838ebac9 100644
--- a/third_party/llvm/llvm.autogenerated.BUILD
+++ b/third_party/llvm/llvm.autogenerated.BUILD
@@ -575,7 +575,8 @@ gentbl(
     name = "amdgpu_isel_target_gen",
     tbl_outs = [
         ("-gen-global-isel", "lib/Target/AMDGPU/AMDGPUGenGlobalISel.inc"),
-        ("-gen-global-isel-combiner -combiners=AMDGPUPreLegalizerCombinerHelper", "lib/Target/AMDGPU/AMDGPUGenGICombiner.inc"),
+        ("-gen-global-isel-combiner -combiners=AMDGPUPreLegalizerCombinerHelper", "lib/Target/AMDGPU/AMDGPUGenPreLegalizeGICombiner.inc"),
+        ("-gen-global-isel-combiner -combiners=AMDGPUPostLegalizerCombinerHelper", "lib/Target/AMDGPU/AMDGPUGenPostLegalizeGICombiner.inc"),
     ],
     tblgen = ":llvm-tblgen",
     td_file = "lib/Target/AMDGPU/AMDGPUGISel.td",
@@ -739,6 +740,7 @@ cc_library(
         ":intrinsic_enums_gen",
         ":intrinsics_impl_gen",
         ":mc",
+        ":object",
         ":support",
     ],
 )
@@ -1155,6 +1157,7 @@ cc_library(
         ":intrinsics_impl_gen",
         ":mc",
         ":mc_disassembler",
+        ":object",
         ":support",
     ],
 )
@@ -1889,6 +1892,7 @@ cc_library(
     copts = llvm_copts,
     deps = [
         ":config",
+        ":debug_info_dwarf",
         ":mc",
         ":support",
     ],
 )
@@ -3198,6 +3202,7 @@ cc_library(
         ":code_gen",
         ":config",
         ":core",
+        ":coroutines",
         ":inst_combine",
         ":instrumentation",
         ":ipo",
@@ -3284,6 +3289,7 @@ cc_library(
         ":intrinsic_enums_gen",
         ":intrinsics_impl_gen",
         ":mc",
+        ":object",
         ":powerpc_info",
         ":powerpc_target_gen",
         ":support",
diff --git a/third_party/mkl_dnn/mkldnn.BUILD b/third_party/mkl_dnn/mkldnn.BUILD
index 39018342176..71dde75e2e0 100644
--- a/third_party/mkl_dnn/mkldnn.BUILD
+++ b/third_party/mkl_dnn/mkldnn.BUILD
@@ -45,7 +45,7 @@ template_rule(
     substitutions = {
         "@MKLDNN_VERSION_MAJOR@": "0",
         "@MKLDNN_VERSION_MINOR@": "21",
-        "@MKLDNN_VERSION_PATCH@": "2",
+        "@MKLDNN_VERSION_PATCH@": "3",
        "@MKLDNN_VERSION_HASH@": "N/A",
     },
 )
diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD
index efab4468ed5..3920a97c236 100644
--- a/third_party/mlir/BUILD
+++ b/third_party/mlir/BUILD
@@ -197,11 +197,28 @@ gentbl(
     ],
 )
 
+cc_library(
+    name = "LoopOpsTransforms",
+    srcs = glob(["lib/Dialect/LoopOps/Transforms/*.cpp"]),
+    hdrs = ["include/mlir/Dialect/LoopOps/Passes.h"],
+    includes = ["include"],
+    deps = [
+        ":AffineOps",
+        ":IR",
+        ":LoopOps",
+        ":Pass",
+        ":StandardOps",
+        ":Transforms",
+        "@llvm-project//llvm:support",
+    ],
+    alwayslink = 1,
+)
+
 filegroup(
     name = "StdOpsTdFiles",
     srcs = [
         "include/mlir/Analysis/CallInterfaces.td",
-        "include/mlir/Dialect/StandardOps/Ops.td",
+        "include/mlir/Dialect/StandardOps/IR/Ops.td",
         "include/mlir/IR/OpAsmInterface.td",
         ":OpBaseTdFiles",
     ],
@@ -213,23 +230,23 @@ gentbl(
     tbl_outs = [
         (
             "-gen-op-decls",
-            "include/mlir/Dialect/StandardOps/Ops.h.inc",
+            "include/mlir/Dialect/StandardOps/IR/Ops.h.inc",
         ),
         (
             "-gen-op-defs",
-            "include/mlir/Dialect/StandardOps/Ops.cpp.inc",
+            "include/mlir/Dialect/StandardOps/IR/Ops.cpp.inc",
         ),
         (
             "-gen-enum-decls",
-            "include/mlir/Dialect/StandardOps/OpsEnums.h.inc",
+            "include/mlir/Dialect/StandardOps/IR/OpsEnums.h.inc",
         ),
         (
             "-gen-enum-defs",
-            "include/mlir/Dialect/StandardOps/OpsEnums.cpp.inc",
+            "include/mlir/Dialect/StandardOps/IR/OpsEnums.cpp.inc",
         ),
     ],
     tblgen = ":mlir-tblgen",
-    td_file = "include/mlir/Dialect/StandardOps/Ops.td",
+    td_file = "include/mlir/Dialect/StandardOps/IR/Ops.td",
     td_srcs = [
         ":StdOpsTdFiles",
     ],
@@ -369,13 +386,13 @@ cc_library(
     name = "StandardOps",
     srcs = glob(
         [
-            "lib/Dialect/StandardOps/*.cpp",
-            "lib/Dialect/StandardOps/*.h",
+            "lib/Dialect/StandardOps/IR/*.cpp",
+            "lib/Dialect/StandardOps/IR/*.h",
             "lib/Dialect/StandardOps/EDSC/*.cpp",
         ],
     ),
     hdrs = glob([
-        "include/mlir/Dialect/StandardOps/*.h",
+        "include/mlir/Dialect/StandardOps/IR/*.h",
         "include/mlir/Dialect/StandardOps/EDSC/*.h",
     ]) + [
         "include/mlir/Analysis/CallInterfaces.h",
@@ -1664,6 +1681,7 @@ cc_library(
         ":LLVMTransforms",
         ":LinalgToLLVM",
         ":LinalgToSPIRV",
+        ":LoopOpsTransforms",
         ":NVVMDialect",
         ":Parser",
         ":Pass",
@@ -1677,6 +1695,7 @@ cc_library(
         "@llvm-project//mlir/test:TestDialect",
         "@llvm-project//mlir/test:TestIR",
         "@llvm-project//mlir/test:TestPass",
+        "@llvm-project//mlir/test:TestSPIRV",
         "@llvm-project//mlir/test:TestTransforms",
     ],
 )
@@ -1746,6 +1765,7 @@ cc_library(
         ":LinalgToSPIRV",
         ":LinalgTransforms",
         ":LoopOps",
+        ":LoopOpsTransforms",
         ":LoopsToGPUPass",
         ":NVVMDialect",
         ":OpenMPDialect",
@@ -1803,6 +1823,7 @@ cc_binary(
         "@llvm-project//mlir/test:TestDialect",
         "@llvm-project//mlir/test:TestIR",
         "@llvm-project//mlir/test:TestPass",
+        "@llvm-project//mlir/test:TestSPIRV",
         "@llvm-project//mlir/test:TestTransforms",
     ],
 )
@@ -2502,7 +2523,7 @@ exports_files(
     "include/mlir/Analysis/CallInterfaces.h",
     "include/mlir/Analysis/CallInterfaces.td",
     "include/mlir/Dialect/LLVMIR/LLVMOpBase.td",
-    "include/mlir/Dialect/StandardOps/Ops.td",
+    "include/mlir/Dialect/StandardOps/IR/Ops.td",
"include/mlir/IR/OpAsmInterface.td", "include/mlir/IR/OpBase.td", "include/mlir/Transforms/InliningUtils.h", diff --git a/third_party/mlir/test.BUILD b/third_party/mlir/test.BUILD index 1e89b553ac4..657d011254b 100644 --- a/third_party/mlir/test.BUILD +++ b/third_party/mlir/test.BUILD @@ -166,3 +166,16 @@ cc_library( "@llvm-project//mlir:VectorToLoops", ], ) + +cc_library( + name = "TestSPIRV", + srcs = glob([ + "lib/Dialect/SPIRV/*.cpp", + ]), + deps = [ + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SPIRVDialect", + "@llvm-project//mlir:SPIRVLowering", + ], +) diff --git a/third_party/remote_config/BUILD.tpl b/third_party/remote_config/BUILD.tpl index 76f360f3e72..7bcee410756 100644 --- a/third_party/remote_config/BUILD.tpl +++ b/third_party/remote_config/BUILD.tpl @@ -2,10 +2,7 @@ platform( name = "platform", constraint_values = [ "@bazel_tools//platforms:x86_64", - "@bazel_tools//platforms:linux", + "@bazel_tools//platforms:%{platform}", ], - exec_properties = { - "container-image": "%{container_image}", - "Pool": "default", - }, + exec_properties = %{exec_properties}, ) diff --git a/third_party/remote_config/common.bzl b/third_party/remote_config/common.bzl index 140cd222e43..5a78a8320b9 100644 --- a/third_party/remote_config/common.bzl +++ b/third_party/remote_config/common.bzl @@ -24,7 +24,7 @@ def which(repository_ctx, program_name): if is_windows(repository_ctx): if not program_name.endswith(".exe"): program_name = program_name + ".exe" - result = execute(repository_ctx, ["where.exe", program_name]) + result = execute(repository_ctx, ["C:\\Windows\\System32\\where.exe", program_name]) else: result = execute(repository_ctx, ["which", program_name]) return result.stdout.rstrip() @@ -90,7 +90,7 @@ def read_dir(repository_ctx, src_dir): src_dir = src_dir.replace("/", "\\") find_result = execute( repository_ctx, - ["cmd.exe", "/c", "dir", src_dir, "/b", "/s", "/a-d"], + ["C:\\Windows\\System32\\cmd.exe", "/c", "dir", src_dir, "/b", "/s", "/a-d"], empty_stdout_fine = True, ) @@ -121,7 +121,7 @@ def get_environ(repository_ctx, name, default_value = None): if is_windows(repository_ctx): result = execute( repository_ctx, - ["cmd.exe", "/c", "echo", "%" + name + "%"], + ["C:\\Windows\\System32\\cmd.exe", "/c", "echo", "%" + name + "%"], empty_stdout_fine = True, ) else: diff --git a/third_party/remote_config/remote_platform_configure.bzl b/third_party/remote_config/remote_platform_configure.bzl index 175649da643..5c2918b8e84 100644 --- a/third_party/remote_config/remote_platform_configure.bzl +++ b/third_party/remote_config/remote_platform_configure.bzl @@ -1,17 +1,27 @@ """Repository rule to create a platform for a docker image to be used with RBE.""" def _remote_platform_configure_impl(repository_ctx): + platform = repository_ctx.attr.platform + exec_properties = repository_ctx.attr.platform_exec_properties + + serialized_exec_properties = "{" + for k, v in exec_properties.items(): + serialized_exec_properties += "\"%s\" : \"%s\"," % (k, v) + serialized_exec_properties += "}" + repository_ctx.template( "BUILD", Label("@org_tensorflow//third_party/remote_config:BUILD.tpl"), { - "%{container_image}": repository_ctx.attr.container_image, + "%{platform}": platform, + "%{exec_properties}": serialized_exec_properties, }, ) remote_platform_configure = repository_rule( implementation = _remote_platform_configure_impl, attrs = { - "container_image": attr.string(mandatory = True), + "platform_exec_properties": attr.string_dict(mandatory = True), + "platform": 
attr.string(default = "linux", values = ["linux", "windows"]), }, ) diff --git a/third_party/snappy.BUILD b/third_party/snappy.BUILD index d93f0307690..a2ab4924f29 100644 --- a/third_party/snappy.BUILD +++ b/third_party/snappy.BUILD @@ -27,6 +27,10 @@ cc_library( "-Wno-implicit-function-declaration", ], }), + defines = select({ + "@org_tensorflow//tensorflow:windows": [], + "//conditions:default": ["HAVE_SYS_UIO_H"], + }), ) genrule( diff --git a/third_party/toolchains/preconfig/centos6/cuda10.0-cudnn7/WORKSPACE b/third_party/toolchains/preconfig/centos6/cuda10.0-cudnn7/WORKSPACE deleted file mode 100644 index b61f572d6d2..00000000000 --- a/third_party/toolchains/preconfig/centos6/cuda10.0-cudnn7/WORKSPACE +++ /dev/null @@ -1,2 +0,0 @@ -# DO NOT EDIT: automatically generated WORKSPACE file for cuda_configure rule -workspace(name = "local_config_cuda") diff --git a/third_party/toolchains/preconfig/centos6/cuda10.0-cudnn7/cuda/BUILD b/third_party/toolchains/preconfig/centos6/cuda10.0-cudnn7/cuda/BUILD deleted file mode 100755 index 3a3421001bb..00000000000 --- a/third_party/toolchains/preconfig/centos6/cuda10.0-cudnn7/cuda/BUILD +++ /dev/null @@ -1,1283 +0,0 @@ -load(":build_defs.bzl", "cuda_header_library") -load("@bazel_skylib//:bzl_library.bzl", "bzl_library") - -licenses(["restricted"]) # MPL2, portions GPL v3, LGPL v3, BSD-like - -package(default_visibility = ["//visibility:public"]) - -config_setting( - name = "using_nvcc", - values = { - "define": "using_cuda_nvcc=true", - }, -) - -config_setting( - name = "using_clang", - values = { - "define": "using_cuda_clang=true", - }, -) - -# Equivalent to using_clang && -c opt. -config_setting( - name = "using_clang_opt", - values = { - "define": "using_cuda_clang=true", - "compilation_mode": "opt", - }, -) - -config_setting( - name = "darwin", - values = {"cpu": "darwin"}, -) - -config_setting( - name = "freebsd", - values = {"cpu": "freebsd"}, -) - -cuda_header_library( - name = "cuda_headers", - hdrs = [ - "cuda/cuda_config.h", - ":cuda-include", - ], - include_prefix = "third_party/gpus", - includes = [ - ".", # required to include cuda/cuda/cuda_config.h as cuda/config.h - "cuda/include", - ], -) - -cc_library( - name = "cudart_static", - srcs = ["cuda/lib/libcudart_static.a"], - linkopts = select({ - ":freebsd": [], - "//conditions:default": ["-ldl"], - }) + [ - "-lpthread", - "-lrt", - ], -) - -cc_library( - name = "cuda_driver", - srcs = ["cuda/lib/libcuda.so"], -) - -cc_library( - name = "cudart", - srcs = ["cuda/lib/libcudart.so.10.0"], - data = ["cuda/lib/libcudart.so.10.0"], - linkstatic = 1, -) - -cuda_header_library( - name = "cublas_headers", - hdrs = [":cublas-include"], - include_prefix = "third_party/gpus/cuda/include", - includes = ["cublas/include"], - strip_include_prefix = "cublas/include", - deps = [":cuda_headers"], -) - -cc_library( - name = "cublas", - srcs = ["cuda/lib/libcublas.so.10.0"], - data = ["cuda/lib/libcublas.so.10.0"], - linkstatic = 1, -) - -cc_library( - name = "cusolver", - srcs = ["cuda/lib/libcusolver.so.10.0"], - data = ["cuda/lib/libcusolver.so.10.0"], - linkopts = ["-lgomp"], - linkstatic = 1, -) - -cc_library( - name = "cudnn", - srcs = ["cuda/lib/libcudnn.so.7"], - data = ["cuda/lib/libcudnn.so.7"], - linkstatic = 1, -) - -cc_library( - name = "cudnn_header", - hdrs = [":cudnn-include"], - include_prefix = "third_party/gpus/cudnn", - strip_include_prefix = "cudnn/include", - deps = [":cuda_headers"], -) - -cc_library( - name = "cufft", - srcs = ["cuda/lib/libcufft.so.10.0"], - data = 
["cuda/lib/libcufft.so.10.0"], - linkstatic = 1, -) - -cc_library( - name = "curand", - srcs = ["cuda/lib/libcurand.so.10.0"], - data = ["cuda/lib/libcurand.so.10.0"], - linkstatic = 1, -) - -cc_library( - name = "cuda", - deps = [ - ":cublas", - ":cuda_headers", - ":cudart", - ":cudnn", - ":cufft", - ":curand", - ], -) - -cuda_header_library( - name = "cupti_headers", - hdrs = [":cuda-extras"], - include_prefix = "third_party/gpus", - includes = ["cuda/extras/CUPTI/include/"], - deps = [":cuda_headers"], -) - -cc_library( - name = "cupti_dsos", - data = ["cuda/lib/libcupti.so.10.0"], -) - -cc_library( - name = "cusparse", - srcs = ["cuda/lib/libcusparse.so.10.0"], - data = ["cuda/lib/libcusparse.so.10.0"], - linkopts = ["-lgomp"], - linkstatic = 1, -) - -cc_library( - name = "libdevice_root", - data = [":cuda-nvvm"], -) - -bzl_library( - name = "build_defs_bzl", - srcs = ["build_defs.bzl"], - deps = [ - "@bazel_skylib//lib:selects", - ], -) - -genrule( - name = "cuda-include", - outs = [ - "cuda/include/CL/cl.h", - "cuda/include/CL/cl.hpp", - "cuda/include/CL/cl_egl.h", - "cuda/include/CL/cl_ext.h", - "cuda/include/CL/cl_gl.h", - "cuda/include/CL/cl_gl_ext.h", - "cuda/include/CL/cl_platform.h", - "cuda/include/CL/opencl.h", - "cuda/include/builtin_types.h", - "cuda/include/channel_descriptor.h", - "cuda/include/common_functions.h", - "cuda/include/cooperative_groups.h", - "cuda/include/cooperative_groups_helpers.h", - "cuda/include/crt/common_functions.h", - "cuda/include/crt/device_double_functions.h", - "cuda/include/crt/device_double_functions.hpp", - "cuda/include/crt/device_functions.h", - "cuda/include/crt/device_functions.hpp", - "cuda/include/crt/func_macro.h", - "cuda/include/crt/host_config.h", - "cuda/include/crt/host_defines.h", - "cuda/include/crt/host_runtime.h", - "cuda/include/crt/math_functions.h", - "cuda/include/crt/math_functions.hpp", - "cuda/include/crt/mma.h", - "cuda/include/crt/mma.hpp", - "cuda/include/crt/nvfunctional", - "cuda/include/crt/sm_70_rt.h", - "cuda/include/crt/sm_70_rt.hpp", - "cuda/include/crt/storage_class.h", - "cuda/include/cuComplex.h", - "cuda/include/cublas.h", - "cuda/include/cublasXt.h", - "cuda/include/cublas_api.h", - "cuda/include/cublas_v2.h", - "cuda/include/cuda.h", - "cuda/include/cudaEGL.h", - "cuda/include/cudaGL.h", - "cuda/include/cudaProfiler.h", - "cuda/include/cudaVDPAU.h", - "cuda/include/cuda_device_runtime_api.h", - "cuda/include/cuda_egl_interop.h", - "cuda/include/cuda_fp16.h", - "cuda/include/cuda_fp16.hpp", - "cuda/include/cuda_gl_interop.h", - "cuda/include/cuda_occupancy.h", - "cuda/include/cuda_profiler_api.h", - "cuda/include/cuda_runtime.h", - "cuda/include/cuda_runtime_api.h", - "cuda/include/cuda_surface_types.h", - "cuda/include/cuda_texture_types.h", - "cuda/include/cuda_vdpau_interop.h", - "cuda/include/cudalibxt.h", - "cuda/include/cudart_platform.h", - "cuda/include/cudnn.h", - "cuda/include/cufft.h", - "cuda/include/cufftXt.h", - "cuda/include/cufftw.h", - "cuda/include/curand.h", - "cuda/include/curand_discrete.h", - "cuda/include/curand_discrete2.h", - "cuda/include/curand_globals.h", - "cuda/include/curand_kernel.h", - "cuda/include/curand_lognormal.h", - "cuda/include/curand_mrg32k3a.h", - "cuda/include/curand_mtgp32.h", - "cuda/include/curand_mtgp32_host.h", - "cuda/include/curand_mtgp32_kernel.h", - "cuda/include/curand_mtgp32dc_p_11213.h", - "cuda/include/curand_normal.h", - "cuda/include/curand_normal_static.h", - "cuda/include/curand_philox4x32_x.h", - "cuda/include/curand_poisson.h", - 
"cuda/include/curand_precalc.h", - "cuda/include/curand_uniform.h", - "cuda/include/cusolverDn.h", - "cuda/include/cusolverRf.h", - "cuda/include/cusolverSp.h", - "cuda/include/cusolverSp_LOWLEVEL_PREVIEW.h", - "cuda/include/cusolver_common.h", - "cuda/include/cusparse.h", - "cuda/include/cusparse_v2.h", - "cuda/include/device_atomic_functions.h", - "cuda/include/device_atomic_functions.hpp", - "cuda/include/device_double_functions.h", - "cuda/include/device_functions.h", - "cuda/include/device_launch_parameters.h", - "cuda/include/device_types.h", - "cuda/include/driver_functions.h", - "cuda/include/driver_types.h", - "cuda/include/fatBinaryCtl.h", - "cuda/include/fatbinary.h", - "cuda/include/host_config.h", - "cuda/include/host_defines.h", - "cuda/include/library_types.h", - "cuda/include/math_constants.h", - "cuda/include/math_functions.h", - "cuda/include/mma.h", - "cuda/include/npp.h", - "cuda/include/nppcore.h", - "cuda/include/nppdefs.h", - "cuda/include/nppi.h", - "cuda/include/nppi_arithmetic_and_logical_operations.h", - "cuda/include/nppi_color_conversion.h", - "cuda/include/nppi_compression_functions.h", - "cuda/include/nppi_computer_vision.h", - "cuda/include/nppi_data_exchange_and_initialization.h", - "cuda/include/nppi_filtering_functions.h", - "cuda/include/nppi_geometry_transforms.h", - "cuda/include/nppi_linear_transforms.h", - "cuda/include/nppi_morphological_operations.h", - "cuda/include/nppi_statistics_functions.h", - "cuda/include/nppi_support_functions.h", - "cuda/include/nppi_threshold_and_compare_operations.h", - "cuda/include/npps.h", - "cuda/include/npps_arithmetic_and_logical_operations.h", - "cuda/include/npps_conversion_functions.h", - "cuda/include/npps_filtering_functions.h", - "cuda/include/npps_initialization.h", - "cuda/include/npps_statistics_functions.h", - "cuda/include/npps_support_functions.h", - "cuda/include/nppversion.h", - "cuda/include/nvToolsExt.h", - "cuda/include/nvToolsExtCuda.h", - "cuda/include/nvToolsExtCudaRt.h", - "cuda/include/nvToolsExtMeta.h", - "cuda/include/nvToolsExtSync.h", - "cuda/include/nvblas.h", - "cuda/include/nvfunctional", - "cuda/include/nvgraph.h", - "cuda/include/nvjpeg.h", - "cuda/include/nvml.h", - "cuda/include/nvrtc.h", - "cuda/include/nvtx3/nvToolsExt.h", - "cuda/include/nvtx3/nvToolsExtCuda.h", - "cuda/include/nvtx3/nvToolsExtCudaRt.h", - "cuda/include/nvtx3/nvToolsExtOpenCL.h", - "cuda/include/nvtx3/nvToolsExtSync.h", - "cuda/include/nvtx3/nvtxDetail/nvtxImpl.h", - "cuda/include/nvtx3/nvtxDetail/nvtxImplCore.h", - "cuda/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h", - "cuda/include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h", - "cuda/include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h", - "cuda/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h", - "cuda/include/nvtx3/nvtxDetail/nvtxInit.h", - "cuda/include/nvtx3/nvtxDetail/nvtxInitDecls.h", - "cuda/include/nvtx3/nvtxDetail/nvtxInitDefs.h", - "cuda/include/nvtx3/nvtxDetail/nvtxLinkOnce.h", - "cuda/include/nvtx3/nvtxDetail/nvtxTypes.h", - "cuda/include/sm_20_atomic_functions.h", - "cuda/include/sm_20_atomic_functions.hpp", - "cuda/include/sm_20_intrinsics.h", - "cuda/include/sm_20_intrinsics.hpp", - "cuda/include/sm_30_intrinsics.h", - "cuda/include/sm_30_intrinsics.hpp", - "cuda/include/sm_32_atomic_functions.h", - "cuda/include/sm_32_atomic_functions.hpp", - "cuda/include/sm_32_intrinsics.h", - "cuda/include/sm_32_intrinsics.hpp", - "cuda/include/sm_35_atomic_functions.h", - "cuda/include/sm_35_intrinsics.h", - "cuda/include/sm_60_atomic_functions.h", - 
"cuda/include/sm_60_atomic_functions.hpp", - "cuda/include/sm_61_intrinsics.h", - "cuda/include/sm_61_intrinsics.hpp", - "cuda/include/sobol_direction_vectors.h", - "cuda/include/surface_functions.h", - "cuda/include/surface_functions.hpp", - "cuda/include/surface_indirect_functions.h", - "cuda/include/surface_indirect_functions.hpp", - "cuda/include/surface_types.h", - "cuda/include/texture_fetch_functions.h", - "cuda/include/texture_fetch_functions.hpp", - "cuda/include/texture_indirect_functions.h", - "cuda/include/texture_indirect_functions.hpp", - "cuda/include/texture_types.h", - "cuda/include/thrust/adjacent_difference.h", - "cuda/include/thrust/advance.h", - "cuda/include/thrust/binary_search.h", - "cuda/include/thrust/complex.h", - "cuda/include/thrust/copy.h", - "cuda/include/thrust/count.h", - "cuda/include/thrust/detail/adjacent_difference.inl", - "cuda/include/thrust/detail/advance.inl", - "cuda/include/thrust/detail/alignment.h", - "cuda/include/thrust/detail/allocator/allocator_traits.h", - "cuda/include/thrust/detail/allocator/allocator_traits.inl", - "cuda/include/thrust/detail/allocator/copy_construct_range.h", - "cuda/include/thrust/detail/allocator/copy_construct_range.inl", - "cuda/include/thrust/detail/allocator/default_construct_range.h", - "cuda/include/thrust/detail/allocator/default_construct_range.inl", - "cuda/include/thrust/detail/allocator/destroy_range.h", - "cuda/include/thrust/detail/allocator/destroy_range.inl", - "cuda/include/thrust/detail/allocator/fill_construct_range.h", - "cuda/include/thrust/detail/allocator/fill_construct_range.inl", - "cuda/include/thrust/detail/allocator/malloc_allocator.h", - "cuda/include/thrust/detail/allocator/malloc_allocator.inl", - "cuda/include/thrust/detail/allocator/no_throw_allocator.h", - "cuda/include/thrust/detail/allocator/tagged_allocator.h", - "cuda/include/thrust/detail/allocator/tagged_allocator.inl", - "cuda/include/thrust/detail/allocator/temporary_allocator.h", - "cuda/include/thrust/detail/allocator/temporary_allocator.inl", - "cuda/include/thrust/detail/binary_search.inl", - "cuda/include/thrust/detail/complex/arithmetic.h", - "cuda/include/thrust/detail/complex/c99math.h", - "cuda/include/thrust/detail/complex/catrig.h", - "cuda/include/thrust/detail/complex/catrigf.h", - "cuda/include/thrust/detail/complex/ccosh.h", - "cuda/include/thrust/detail/complex/ccoshf.h", - "cuda/include/thrust/detail/complex/cexp.h", - "cuda/include/thrust/detail/complex/cexpf.h", - "cuda/include/thrust/detail/complex/clog.h", - "cuda/include/thrust/detail/complex/clogf.h", - "cuda/include/thrust/detail/complex/complex.inl", - "cuda/include/thrust/detail/complex/cpow.h", - "cuda/include/thrust/detail/complex/cproj.h", - "cuda/include/thrust/detail/complex/csinh.h", - "cuda/include/thrust/detail/complex/csinhf.h", - "cuda/include/thrust/detail/complex/csqrt.h", - "cuda/include/thrust/detail/complex/csqrtf.h", - "cuda/include/thrust/detail/complex/ctanh.h", - "cuda/include/thrust/detail/complex/ctanhf.h", - "cuda/include/thrust/detail/complex/math_private.h", - "cuda/include/thrust/detail/complex/stream.h", - "cuda/include/thrust/detail/config.h", - "cuda/include/thrust/detail/config/compiler.h", - "cuda/include/thrust/detail/config/compiler_fence.h", - "cuda/include/thrust/detail/config/config.h", - "cuda/include/thrust/detail/config/debug.h", - "cuda/include/thrust/detail/config/device_system.h", - "cuda/include/thrust/detail/config/exec_check_disable.h", - "cuda/include/thrust/detail/config/forceinline.h", - 
"cuda/include/thrust/detail/config/global_workarounds.h", - "cuda/include/thrust/detail/config/host_device.h", - "cuda/include/thrust/detail/config/host_system.h", - "cuda/include/thrust/detail/config/simple_defines.h", - "cuda/include/thrust/detail/contiguous_storage.h", - "cuda/include/thrust/detail/contiguous_storage.inl", - "cuda/include/thrust/detail/copy.h", - "cuda/include/thrust/detail/copy.inl", - "cuda/include/thrust/detail/copy_if.h", - "cuda/include/thrust/detail/copy_if.inl", - "cuda/include/thrust/detail/count.inl", - "cuda/include/thrust/detail/cstdint.h", - "cuda/include/thrust/detail/device_delete.inl", - "cuda/include/thrust/detail/device_free.inl", - "cuda/include/thrust/detail/device_malloc.inl", - "cuda/include/thrust/detail/device_new.inl", - "cuda/include/thrust/detail/device_ptr.inl", - "cuda/include/thrust/detail/device_reference.inl", - "cuda/include/thrust/detail/device_vector.inl", - "cuda/include/thrust/detail/dispatch/is_trivial_copy.h", - "cuda/include/thrust/detail/distance.inl", - "cuda/include/thrust/detail/equal.inl", - "cuda/include/thrust/detail/execute_with_allocator.h", - "cuda/include/thrust/detail/execution_policy.h", - "cuda/include/thrust/detail/extrema.inl", - "cuda/include/thrust/detail/fill.inl", - "cuda/include/thrust/detail/find.inl", - "cuda/include/thrust/detail/for_each.inl", - "cuda/include/thrust/detail/function.h", - "cuda/include/thrust/detail/functional.inl", - "cuda/include/thrust/detail/functional/actor.h", - "cuda/include/thrust/detail/functional/actor.inl", - "cuda/include/thrust/detail/functional/argument.h", - "cuda/include/thrust/detail/functional/composite.h", - "cuda/include/thrust/detail/functional/operators.h", - "cuda/include/thrust/detail/functional/operators/arithmetic_operators.h", - "cuda/include/thrust/detail/functional/operators/assignment_operator.h", - "cuda/include/thrust/detail/functional/operators/bitwise_operators.h", - "cuda/include/thrust/detail/functional/operators/compound_assignment_operators.h", - "cuda/include/thrust/detail/functional/operators/logical_operators.h", - "cuda/include/thrust/detail/functional/operators/operator_adaptors.h", - "cuda/include/thrust/detail/functional/operators/relational_operators.h", - "cuda/include/thrust/detail/functional/placeholder.h", - "cuda/include/thrust/detail/functional/value.h", - "cuda/include/thrust/detail/gather.inl", - "cuda/include/thrust/detail/generate.inl", - "cuda/include/thrust/detail/get_iterator_value.h", - "cuda/include/thrust/detail/host_vector.inl", - "cuda/include/thrust/detail/inner_product.inl", - "cuda/include/thrust/detail/integer_math.h", - "cuda/include/thrust/detail/integer_traits.h", - "cuda/include/thrust/detail/internal_functional.h", - "cuda/include/thrust/detail/logical.inl", - "cuda/include/thrust/detail/malloc_and_free.h", - "cuda/include/thrust/detail/merge.inl", - "cuda/include/thrust/detail/minmax.h", - "cuda/include/thrust/detail/mismatch.inl", - "cuda/include/thrust/detail/mpl/math.h", - "cuda/include/thrust/detail/numeric_traits.h", - "cuda/include/thrust/detail/overlapped_copy.h", - "cuda/include/thrust/detail/pair.inl", - "cuda/include/thrust/detail/partition.inl", - "cuda/include/thrust/detail/pointer.h", - "cuda/include/thrust/detail/pointer.inl", - "cuda/include/thrust/detail/preprocessor.h", - "cuda/include/thrust/detail/range/head_flags.h", - "cuda/include/thrust/detail/range/tail_flags.h", - "cuda/include/thrust/detail/raw_pointer_cast.h", - "cuda/include/thrust/detail/raw_reference_cast.h", - 
"cuda/include/thrust/detail/reduce.inl", - "cuda/include/thrust/detail/reference.h", - "cuda/include/thrust/detail/reference.inl", - "cuda/include/thrust/detail/reference_forward_declaration.h", - "cuda/include/thrust/detail/remove.inl", - "cuda/include/thrust/detail/replace.inl", - "cuda/include/thrust/detail/reverse.inl", - "cuda/include/thrust/detail/scan.inl", - "cuda/include/thrust/detail/scatter.inl", - "cuda/include/thrust/detail/seq.h", - "cuda/include/thrust/detail/sequence.inl", - "cuda/include/thrust/detail/set_operations.inl", - "cuda/include/thrust/detail/sort.inl", - "cuda/include/thrust/detail/static_assert.h", - "cuda/include/thrust/detail/static_map.h", - "cuda/include/thrust/detail/swap.h", - "cuda/include/thrust/detail/swap.inl", - "cuda/include/thrust/detail/swap_ranges.inl", - "cuda/include/thrust/detail/tabulate.inl", - "cuda/include/thrust/detail/temporary_array.h", - "cuda/include/thrust/detail/temporary_array.inl", - "cuda/include/thrust/detail/temporary_buffer.h", - "cuda/include/thrust/detail/transform.inl", - "cuda/include/thrust/detail/transform_reduce.inl", - "cuda/include/thrust/detail/transform_scan.inl", - "cuda/include/thrust/detail/trivial_sequence.h", - "cuda/include/thrust/detail/tuple.inl", - "cuda/include/thrust/detail/tuple_meta_transform.h", - "cuda/include/thrust/detail/tuple_transform.h", - "cuda/include/thrust/detail/type_traits.h", - "cuda/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h", - "cuda/include/thrust/detail/type_traits/function_traits.h", - "cuda/include/thrust/detail/type_traits/has_member_function.h", - "cuda/include/thrust/detail/type_traits/has_nested_type.h", - "cuda/include/thrust/detail/type_traits/has_trivial_assign.h", - "cuda/include/thrust/detail/type_traits/is_call_possible.h", - "cuda/include/thrust/detail/type_traits/is_metafunction_defined.h", - "cuda/include/thrust/detail/type_traits/iterator/is_discard_iterator.h", - "cuda/include/thrust/detail/type_traits/iterator/is_output_iterator.h", - "cuda/include/thrust/detail/type_traits/minimum_type.h", - "cuda/include/thrust/detail/type_traits/pointer_traits.h", - "cuda/include/thrust/detail/type_traits/result_of_adaptable_function.h", - "cuda/include/thrust/detail/uninitialized_copy.inl", - "cuda/include/thrust/detail/uninitialized_fill.inl", - "cuda/include/thrust/detail/unique.inl", - "cuda/include/thrust/detail/use_default.h", - "cuda/include/thrust/detail/util/align.h", - "cuda/include/thrust/detail/util/blocking.h", - "cuda/include/thrust/detail/vector_base.h", - "cuda/include/thrust/detail/vector_base.inl", - "cuda/include/thrust/device_allocator.h", - "cuda/include/thrust/device_delete.h", - "cuda/include/thrust/device_free.h", - "cuda/include/thrust/device_malloc.h", - "cuda/include/thrust/device_malloc_allocator.h", - "cuda/include/thrust/device_new.h", - "cuda/include/thrust/device_new_allocator.h", - "cuda/include/thrust/device_ptr.h", - "cuda/include/thrust/device_reference.h", - "cuda/include/thrust/device_vector.h", - "cuda/include/thrust/distance.h", - "cuda/include/thrust/equal.h", - "cuda/include/thrust/execution_policy.h", - "cuda/include/thrust/extrema.h", - "cuda/include/thrust/fill.h", - "cuda/include/thrust/find.h", - "cuda/include/thrust/for_each.h", - "cuda/include/thrust/functional.h", - "cuda/include/thrust/gather.h", - "cuda/include/thrust/generate.h", - "cuda/include/thrust/host_vector.h", - "cuda/include/thrust/inner_product.h", - "cuda/include/thrust/iterator/constant_iterator.h", - 
"cuda/include/thrust/iterator/counting_iterator.h", - "cuda/include/thrust/iterator/detail/any_assign.h", - "cuda/include/thrust/iterator/detail/any_system_tag.h", - "cuda/include/thrust/iterator/detail/constant_iterator_base.h", - "cuda/include/thrust/iterator/detail/counting_iterator.inl", - "cuda/include/thrust/iterator/detail/device_system_tag.h", - "cuda/include/thrust/iterator/detail/discard_iterator_base.h", - "cuda/include/thrust/iterator/detail/distance_from_result.h", - "cuda/include/thrust/iterator/detail/host_system_tag.h", - "cuda/include/thrust/iterator/detail/is_iterator_category.h", - "cuda/include/thrust/iterator/detail/is_trivial_iterator.h", - "cuda/include/thrust/iterator/detail/iterator_adaptor_base.h", - "cuda/include/thrust/iterator/detail/iterator_category_to_system.h", - "cuda/include/thrust/iterator/detail/iterator_category_to_traversal.h", - "cuda/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h", - "cuda/include/thrust/iterator/detail/iterator_facade_category.h", - "cuda/include/thrust/iterator/detail/iterator_traits.inl", - "cuda/include/thrust/iterator/detail/iterator_traversal_tags.h", - "cuda/include/thrust/iterator/detail/join_iterator.h", - "cuda/include/thrust/iterator/detail/minimum_category.h", - "cuda/include/thrust/iterator/detail/minimum_system.h", - "cuda/include/thrust/iterator/detail/normal_iterator.h", - "cuda/include/thrust/iterator/detail/permutation_iterator_base.h", - "cuda/include/thrust/iterator/detail/retag.h", - "cuda/include/thrust/iterator/detail/reverse_iterator.inl", - "cuda/include/thrust/iterator/detail/reverse_iterator_base.h", - "cuda/include/thrust/iterator/detail/tagged_iterator.h", - "cuda/include/thrust/iterator/detail/transform_iterator.inl", - "cuda/include/thrust/iterator/detail/transform_output_iterator.inl", - "cuda/include/thrust/iterator/detail/tuple_of_iterator_references.h", - "cuda/include/thrust/iterator/detail/universal_categories.h", - "cuda/include/thrust/iterator/detail/zip_iterator.inl", - "cuda/include/thrust/iterator/detail/zip_iterator_base.h", - "cuda/include/thrust/iterator/discard_iterator.h", - "cuda/include/thrust/iterator/iterator_adaptor.h", - "cuda/include/thrust/iterator/iterator_categories.h", - "cuda/include/thrust/iterator/iterator_facade.h", - "cuda/include/thrust/iterator/iterator_traits.h", - "cuda/include/thrust/iterator/permutation_iterator.h", - "cuda/include/thrust/iterator/retag.h", - "cuda/include/thrust/iterator/reverse_iterator.h", - "cuda/include/thrust/iterator/transform_iterator.h", - "cuda/include/thrust/iterator/transform_output_iterator.h", - "cuda/include/thrust/iterator/zip_iterator.h", - "cuda/include/thrust/logical.h", - "cuda/include/thrust/memory.h", - "cuda/include/thrust/merge.h", - "cuda/include/thrust/mismatch.h", - "cuda/include/thrust/pair.h", - "cuda/include/thrust/partition.h", - "cuda/include/thrust/random.h", - "cuda/include/thrust/random/detail/discard_block_engine.inl", - "cuda/include/thrust/random/detail/linear_congruential_engine.inl", - "cuda/include/thrust/random/detail/linear_congruential_engine_discard.h", - "cuda/include/thrust/random/detail/linear_feedback_shift_engine.inl", - "cuda/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h", - "cuda/include/thrust/random/detail/mod.h", - "cuda/include/thrust/random/detail/normal_distribution.inl", - "cuda/include/thrust/random/detail/normal_distribution_base.h", - "cuda/include/thrust/random/detail/random_core_access.h", - 
"cuda/include/thrust/random/detail/subtract_with_carry_engine.inl", - "cuda/include/thrust/random/detail/uniform_int_distribution.inl", - "cuda/include/thrust/random/detail/uniform_real_distribution.inl", - "cuda/include/thrust/random/detail/xor_combine_engine.inl", - "cuda/include/thrust/random/detail/xor_combine_engine_max.h", - "cuda/include/thrust/random/discard_block_engine.h", - "cuda/include/thrust/random/linear_congruential_engine.h", - "cuda/include/thrust/random/linear_feedback_shift_engine.h", - "cuda/include/thrust/random/normal_distribution.h", - "cuda/include/thrust/random/subtract_with_carry_engine.h", - "cuda/include/thrust/random/uniform_int_distribution.h", - "cuda/include/thrust/random/uniform_real_distribution.h", - "cuda/include/thrust/random/xor_combine_engine.h", - "cuda/include/thrust/reduce.h", - "cuda/include/thrust/remove.h", - "cuda/include/thrust/replace.h", - "cuda/include/thrust/reverse.h", - "cuda/include/thrust/scan.h", - "cuda/include/thrust/scatter.h", - "cuda/include/thrust/sequence.h", - "cuda/include/thrust/set_operations.h", - "cuda/include/thrust/sort.h", - "cuda/include/thrust/swap.h", - "cuda/include/thrust/system/cpp/detail/adjacent_difference.h", - "cuda/include/thrust/system/cpp/detail/assign_value.h", - "cuda/include/thrust/system/cpp/detail/binary_search.h", - "cuda/include/thrust/system/cpp/detail/copy.h", - "cuda/include/thrust/system/cpp/detail/copy_if.h", - "cuda/include/thrust/system/cpp/detail/count.h", - "cuda/include/thrust/system/cpp/detail/equal.h", - "cuda/include/thrust/system/cpp/detail/execution_policy.h", - "cuda/include/thrust/system/cpp/detail/extrema.h", - "cuda/include/thrust/system/cpp/detail/fill.h", - "cuda/include/thrust/system/cpp/detail/find.h", - "cuda/include/thrust/system/cpp/detail/for_each.h", - "cuda/include/thrust/system/cpp/detail/gather.h", - "cuda/include/thrust/system/cpp/detail/generate.h", - "cuda/include/thrust/system/cpp/detail/get_value.h", - "cuda/include/thrust/system/cpp/detail/inner_product.h", - "cuda/include/thrust/system/cpp/detail/iter_swap.h", - "cuda/include/thrust/system/cpp/detail/logical.h", - "cuda/include/thrust/system/cpp/detail/malloc_and_free.h", - "cuda/include/thrust/system/cpp/detail/memory.inl", - "cuda/include/thrust/system/cpp/detail/merge.h", - "cuda/include/thrust/system/cpp/detail/mismatch.h", - "cuda/include/thrust/system/cpp/detail/par.h", - "cuda/include/thrust/system/cpp/detail/partition.h", - "cuda/include/thrust/system/cpp/detail/reduce.h", - "cuda/include/thrust/system/cpp/detail/reduce_by_key.h", - "cuda/include/thrust/system/cpp/detail/remove.h", - "cuda/include/thrust/system/cpp/detail/replace.h", - "cuda/include/thrust/system/cpp/detail/reverse.h", - "cuda/include/thrust/system/cpp/detail/scan.h", - "cuda/include/thrust/system/cpp/detail/scan_by_key.h", - "cuda/include/thrust/system/cpp/detail/scatter.h", - "cuda/include/thrust/system/cpp/detail/sequence.h", - "cuda/include/thrust/system/cpp/detail/set_operations.h", - "cuda/include/thrust/system/cpp/detail/sort.h", - "cuda/include/thrust/system/cpp/detail/swap_ranges.h", - "cuda/include/thrust/system/cpp/detail/tabulate.h", - "cuda/include/thrust/system/cpp/detail/temporary_buffer.h", - "cuda/include/thrust/system/cpp/detail/transform.h", - "cuda/include/thrust/system/cpp/detail/transform_reduce.h", - "cuda/include/thrust/system/cpp/detail/transform_scan.h", - "cuda/include/thrust/system/cpp/detail/uninitialized_copy.h", - "cuda/include/thrust/system/cpp/detail/uninitialized_fill.h", - 
"cuda/include/thrust/system/cpp/detail/unique.h", - "cuda/include/thrust/system/cpp/detail/unique_by_key.h", - "cuda/include/thrust/system/cpp/detail/vector.inl", - "cuda/include/thrust/system/cpp/execution_policy.h", - "cuda/include/thrust/system/cpp/memory.h", - "cuda/include/thrust/system/cpp/vector.h", - "cuda/include/thrust/system/cuda/config.h", - "cuda/include/thrust/system/cuda/detail/adjacent_difference.h", - "cuda/include/thrust/system/cuda/detail/assign_value.h", - "cuda/include/thrust/system/cuda/detail/binary_search.h", - "cuda/include/thrust/system/cuda/detail/copy.h", - "cuda/include/thrust/system/cuda/detail/copy_if.h", - "cuda/include/thrust/system/cuda/detail/core/agent_launcher.h", - "cuda/include/thrust/system/cuda/detail/core/alignment.h", - "cuda/include/thrust/system/cuda/detail/core/triple_chevron_launch.h", - "cuda/include/thrust/system/cuda/detail/core/util.h", - "cuda/include/thrust/system/cuda/detail/count.h", - "cuda/include/thrust/system/cuda/detail/cross_system.h", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_load.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_scan.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_store.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh", - 
"cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh", - "cuda/include/thrust/system/cuda/detail/cub/cub.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_partition.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_scan.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_select.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh", - "cuda/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh", - "cuda/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh", - "cuda/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh", - "cuda/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh", - "cuda/include/thrust/system/cuda/detail/cub/host/mutex.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh", - "cuda/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh", - "cuda/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh", - "cuda/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh", - "cuda/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh", - "cuda/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh", - "cuda/include/thrust/system/cuda/detail/cub/util_allocator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/util_arch.cuh", - "cuda/include/thrust/system/cuda/detail/cub/util_debug.cuh", - "cuda/include/thrust/system/cuda/detail/cub/util_device.cuh", - "cuda/include/thrust/system/cuda/detail/cub/util_macro.cuh", - "cuda/include/thrust/system/cuda/detail/cub/util_namespace.cuh", - 
"cuda/include/thrust/system/cuda/detail/cub/util_ptx.cuh", - "cuda/include/thrust/system/cuda/detail/cub/util_type.cuh", - "cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh", - "cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh", - "cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh", - "cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh", - "cuda/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh", - "cuda/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh", - "cuda/include/thrust/system/cuda/detail/equal.h", - "cuda/include/thrust/system/cuda/detail/error.inl", - "cuda/include/thrust/system/cuda/detail/execution_policy.h", - "cuda/include/thrust/system/cuda/detail/extrema.h", - "cuda/include/thrust/system/cuda/detail/fill.h", - "cuda/include/thrust/system/cuda/detail/find.h", - "cuda/include/thrust/system/cuda/detail/for_each.h", - "cuda/include/thrust/system/cuda/detail/gather.h", - "cuda/include/thrust/system/cuda/detail/generate.h", - "cuda/include/thrust/system/cuda/detail/get_value.h", - "cuda/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h", - "cuda/include/thrust/system/cuda/detail/guarded_driver_types.h", - "cuda/include/thrust/system/cuda/detail/inner_product.h", - "cuda/include/thrust/system/cuda/detail/internal/copy_cross_system.h", - "cuda/include/thrust/system/cuda/detail/internal/copy_device_to_device.h", - "cuda/include/thrust/system/cuda/detail/iter_swap.h", - "cuda/include/thrust/system/cuda/detail/logical.h", - "cuda/include/thrust/system/cuda/detail/malloc_and_free.h", - "cuda/include/thrust/system/cuda/detail/memory.inl", - "cuda/include/thrust/system/cuda/detail/merge.h", - "cuda/include/thrust/system/cuda/detail/mismatch.h", - "cuda/include/thrust/system/cuda/detail/par.h", - "cuda/include/thrust/system/cuda/detail/par_to_seq.h", - "cuda/include/thrust/system/cuda/detail/parallel_for.h", - "cuda/include/thrust/system/cuda/detail/partition.h", - "cuda/include/thrust/system/cuda/detail/reduce.h", - "cuda/include/thrust/system/cuda/detail/reduce_by_key.h", - "cuda/include/thrust/system/cuda/detail/remove.h", - "cuda/include/thrust/system/cuda/detail/replace.h", - "cuda/include/thrust/system/cuda/detail/reverse.h", - "cuda/include/thrust/system/cuda/detail/scan.h", - "cuda/include/thrust/system/cuda/detail/scan_by_key.h", - "cuda/include/thrust/system/cuda/detail/scatter.h", - "cuda/include/thrust/system/cuda/detail/sequence.h", - "cuda/include/thrust/system/cuda/detail/set_operations.h", - "cuda/include/thrust/system/cuda/detail/sort.h", - "cuda/include/thrust/system/cuda/detail/swap_ranges.h", - "cuda/include/thrust/system/cuda/detail/tabulate.h", - "cuda/include/thrust/system/cuda/detail/temporary_buffer.h", - "cuda/include/thrust/system/cuda/detail/terminate.h", - "cuda/include/thrust/system/cuda/detail/transform.h", - "cuda/include/thrust/system/cuda/detail/transform_reduce.h", - "cuda/include/thrust/system/cuda/detail/transform_scan.h", - "cuda/include/thrust/system/cuda/detail/uninitialized_copy.h", - "cuda/include/thrust/system/cuda/detail/uninitialized_fill.h", - "cuda/include/thrust/system/cuda/detail/unique.h", - "cuda/include/thrust/system/cuda/detail/unique_by_key.h", - "cuda/include/thrust/system/cuda/detail/util.h", - "cuda/include/thrust/system/cuda/detail/vector.inl", - "cuda/include/thrust/system/cuda/error.h", - "cuda/include/thrust/system/cuda/execution_policy.h", - 
"cuda/include/thrust/system/cuda/experimental/pinned_allocator.h", - "cuda/include/thrust/system/cuda/memory.h", - "cuda/include/thrust/system/cuda/vector.h", - "cuda/include/thrust/system/detail/adl/adjacent_difference.h", - "cuda/include/thrust/system/detail/adl/assign_value.h", - "cuda/include/thrust/system/detail/adl/binary_search.h", - "cuda/include/thrust/system/detail/adl/copy.h", - "cuda/include/thrust/system/detail/adl/copy_if.h", - "cuda/include/thrust/system/detail/adl/count.h", - "cuda/include/thrust/system/detail/adl/equal.h", - "cuda/include/thrust/system/detail/adl/extrema.h", - "cuda/include/thrust/system/detail/adl/fill.h", - "cuda/include/thrust/system/detail/adl/find.h", - "cuda/include/thrust/system/detail/adl/for_each.h", - "cuda/include/thrust/system/detail/adl/gather.h", - "cuda/include/thrust/system/detail/adl/generate.h", - "cuda/include/thrust/system/detail/adl/get_value.h", - "cuda/include/thrust/system/detail/adl/inner_product.h", - "cuda/include/thrust/system/detail/adl/iter_swap.h", - "cuda/include/thrust/system/detail/adl/logical.h", - "cuda/include/thrust/system/detail/adl/malloc_and_free.h", - "cuda/include/thrust/system/detail/adl/merge.h", - "cuda/include/thrust/system/detail/adl/mismatch.h", - "cuda/include/thrust/system/detail/adl/partition.h", - "cuda/include/thrust/system/detail/adl/reduce.h", - "cuda/include/thrust/system/detail/adl/reduce_by_key.h", - "cuda/include/thrust/system/detail/adl/remove.h", - "cuda/include/thrust/system/detail/adl/replace.h", - "cuda/include/thrust/system/detail/adl/reverse.h", - "cuda/include/thrust/system/detail/adl/scan.h", - "cuda/include/thrust/system/detail/adl/scan_by_key.h", - "cuda/include/thrust/system/detail/adl/scatter.h", - "cuda/include/thrust/system/detail/adl/sequence.h", - "cuda/include/thrust/system/detail/adl/set_operations.h", - "cuda/include/thrust/system/detail/adl/sort.h", - "cuda/include/thrust/system/detail/adl/swap_ranges.h", - "cuda/include/thrust/system/detail/adl/tabulate.h", - "cuda/include/thrust/system/detail/adl/temporary_buffer.h", - "cuda/include/thrust/system/detail/adl/transform.h", - "cuda/include/thrust/system/detail/adl/transform_reduce.h", - "cuda/include/thrust/system/detail/adl/transform_scan.h", - "cuda/include/thrust/system/detail/adl/uninitialized_copy.h", - "cuda/include/thrust/system/detail/adl/uninitialized_fill.h", - "cuda/include/thrust/system/detail/adl/unique.h", - "cuda/include/thrust/system/detail/adl/unique_by_key.h", - "cuda/include/thrust/system/detail/bad_alloc.h", - "cuda/include/thrust/system/detail/errno.h", - "cuda/include/thrust/system/detail/error_category.inl", - "cuda/include/thrust/system/detail/error_code.inl", - "cuda/include/thrust/system/detail/error_condition.inl", - "cuda/include/thrust/system/detail/generic/adjacent_difference.h", - "cuda/include/thrust/system/detail/generic/adjacent_difference.inl", - "cuda/include/thrust/system/detail/generic/advance.h", - "cuda/include/thrust/system/detail/generic/advance.inl", - "cuda/include/thrust/system/detail/generic/binary_search.h", - "cuda/include/thrust/system/detail/generic/binary_search.inl", - "cuda/include/thrust/system/detail/generic/copy.h", - "cuda/include/thrust/system/detail/generic/copy.inl", - "cuda/include/thrust/system/detail/generic/copy_if.h", - "cuda/include/thrust/system/detail/generic/copy_if.inl", - "cuda/include/thrust/system/detail/generic/count.h", - "cuda/include/thrust/system/detail/generic/count.inl", - "cuda/include/thrust/system/detail/generic/distance.h", - 
"cuda/include/thrust/system/detail/generic/distance.inl", - "cuda/include/thrust/system/detail/generic/equal.h", - "cuda/include/thrust/system/detail/generic/equal.inl", - "cuda/include/thrust/system/detail/generic/extrema.h", - "cuda/include/thrust/system/detail/generic/extrema.inl", - "cuda/include/thrust/system/detail/generic/fill.h", - "cuda/include/thrust/system/detail/generic/find.h", - "cuda/include/thrust/system/detail/generic/find.inl", - "cuda/include/thrust/system/detail/generic/for_each.h", - "cuda/include/thrust/system/detail/generic/gather.h", - "cuda/include/thrust/system/detail/generic/gather.inl", - "cuda/include/thrust/system/detail/generic/generate.h", - "cuda/include/thrust/system/detail/generic/generate.inl", - "cuda/include/thrust/system/detail/generic/inner_product.h", - "cuda/include/thrust/system/detail/generic/inner_product.inl", - "cuda/include/thrust/system/detail/generic/logical.h", - "cuda/include/thrust/system/detail/generic/memory.h", - "cuda/include/thrust/system/detail/generic/memory.inl", - "cuda/include/thrust/system/detail/generic/merge.h", - "cuda/include/thrust/system/detail/generic/merge.inl", - "cuda/include/thrust/system/detail/generic/mismatch.h", - "cuda/include/thrust/system/detail/generic/mismatch.inl", - "cuda/include/thrust/system/detail/generic/partition.h", - "cuda/include/thrust/system/detail/generic/partition.inl", - "cuda/include/thrust/system/detail/generic/reduce.h", - "cuda/include/thrust/system/detail/generic/reduce.inl", - "cuda/include/thrust/system/detail/generic/reduce_by_key.h", - "cuda/include/thrust/system/detail/generic/reduce_by_key.inl", - "cuda/include/thrust/system/detail/generic/remove.h", - "cuda/include/thrust/system/detail/generic/remove.inl", - "cuda/include/thrust/system/detail/generic/replace.h", - "cuda/include/thrust/system/detail/generic/replace.inl", - "cuda/include/thrust/system/detail/generic/reverse.h", - "cuda/include/thrust/system/detail/generic/reverse.inl", - "cuda/include/thrust/system/detail/generic/scalar/binary_search.h", - "cuda/include/thrust/system/detail/generic/scalar/binary_search.inl", - "cuda/include/thrust/system/detail/generic/scan.h", - "cuda/include/thrust/system/detail/generic/scan.inl", - "cuda/include/thrust/system/detail/generic/scan_by_key.h", - "cuda/include/thrust/system/detail/generic/scan_by_key.inl", - "cuda/include/thrust/system/detail/generic/scatter.h", - "cuda/include/thrust/system/detail/generic/scatter.inl", - "cuda/include/thrust/system/detail/generic/select_system.h", - "cuda/include/thrust/system/detail/generic/sequence.h", - "cuda/include/thrust/system/detail/generic/sequence.inl", - "cuda/include/thrust/system/detail/generic/set_operations.h", - "cuda/include/thrust/system/detail/generic/set_operations.inl", - "cuda/include/thrust/system/detail/generic/sort.h", - "cuda/include/thrust/system/detail/generic/sort.inl", - "cuda/include/thrust/system/detail/generic/swap_ranges.h", - "cuda/include/thrust/system/detail/generic/swap_ranges.inl", - "cuda/include/thrust/system/detail/generic/tabulate.h", - "cuda/include/thrust/system/detail/generic/tabulate.inl", - "cuda/include/thrust/system/detail/generic/tag.h", - "cuda/include/thrust/system/detail/generic/temporary_buffer.h", - "cuda/include/thrust/system/detail/generic/temporary_buffer.inl", - "cuda/include/thrust/system/detail/generic/transform.h", - "cuda/include/thrust/system/detail/generic/transform.inl", - "cuda/include/thrust/system/detail/generic/transform_reduce.h", - 
"cuda/include/thrust/system/detail/generic/transform_reduce.inl", - "cuda/include/thrust/system/detail/generic/transform_scan.h", - "cuda/include/thrust/system/detail/generic/transform_scan.inl", - "cuda/include/thrust/system/detail/generic/type_traits.h", - "cuda/include/thrust/system/detail/generic/uninitialized_copy.h", - "cuda/include/thrust/system/detail/generic/uninitialized_copy.inl", - "cuda/include/thrust/system/detail/generic/uninitialized_fill.h", - "cuda/include/thrust/system/detail/generic/uninitialized_fill.inl", - "cuda/include/thrust/system/detail/generic/unique.h", - "cuda/include/thrust/system/detail/generic/unique.inl", - "cuda/include/thrust/system/detail/generic/unique_by_key.h", - "cuda/include/thrust/system/detail/generic/unique_by_key.inl", - "cuda/include/thrust/system/detail/internal/decompose.h", - "cuda/include/thrust/system/detail/sequential/adjacent_difference.h", - "cuda/include/thrust/system/detail/sequential/assign_value.h", - "cuda/include/thrust/system/detail/sequential/binary_search.h", - "cuda/include/thrust/system/detail/sequential/copy.h", - "cuda/include/thrust/system/detail/sequential/copy.inl", - "cuda/include/thrust/system/detail/sequential/copy_backward.h", - "cuda/include/thrust/system/detail/sequential/copy_if.h", - "cuda/include/thrust/system/detail/sequential/count.h", - "cuda/include/thrust/system/detail/sequential/equal.h", - "cuda/include/thrust/system/detail/sequential/execution_policy.h", - "cuda/include/thrust/system/detail/sequential/extrema.h", - "cuda/include/thrust/system/detail/sequential/fill.h", - "cuda/include/thrust/system/detail/sequential/find.h", - "cuda/include/thrust/system/detail/sequential/for_each.h", - "cuda/include/thrust/system/detail/sequential/gather.h", - "cuda/include/thrust/system/detail/sequential/general_copy.h", - "cuda/include/thrust/system/detail/sequential/generate.h", - "cuda/include/thrust/system/detail/sequential/get_value.h", - "cuda/include/thrust/system/detail/sequential/inner_product.h", - "cuda/include/thrust/system/detail/sequential/insertion_sort.h", - "cuda/include/thrust/system/detail/sequential/iter_swap.h", - "cuda/include/thrust/system/detail/sequential/logical.h", - "cuda/include/thrust/system/detail/sequential/malloc_and_free.h", - "cuda/include/thrust/system/detail/sequential/merge.h", - "cuda/include/thrust/system/detail/sequential/merge.inl", - "cuda/include/thrust/system/detail/sequential/mismatch.h", - "cuda/include/thrust/system/detail/sequential/partition.h", - "cuda/include/thrust/system/detail/sequential/reduce.h", - "cuda/include/thrust/system/detail/sequential/reduce_by_key.h", - "cuda/include/thrust/system/detail/sequential/remove.h", - "cuda/include/thrust/system/detail/sequential/replace.h", - "cuda/include/thrust/system/detail/sequential/reverse.h", - "cuda/include/thrust/system/detail/sequential/scan.h", - "cuda/include/thrust/system/detail/sequential/scan_by_key.h", - "cuda/include/thrust/system/detail/sequential/scatter.h", - "cuda/include/thrust/system/detail/sequential/sequence.h", - "cuda/include/thrust/system/detail/sequential/set_operations.h", - "cuda/include/thrust/system/detail/sequential/sort.h", - "cuda/include/thrust/system/detail/sequential/sort.inl", - "cuda/include/thrust/system/detail/sequential/stable_merge_sort.h", - "cuda/include/thrust/system/detail/sequential/stable_merge_sort.inl", - "cuda/include/thrust/system/detail/sequential/stable_primitive_sort.h", - "cuda/include/thrust/system/detail/sequential/stable_primitive_sort.inl", - 
"cuda/include/thrust/system/detail/sequential/stable_radix_sort.h", - "cuda/include/thrust/system/detail/sequential/stable_radix_sort.inl", - "cuda/include/thrust/system/detail/sequential/swap_ranges.h", - "cuda/include/thrust/system/detail/sequential/tabulate.h", - "cuda/include/thrust/system/detail/sequential/temporary_buffer.h", - "cuda/include/thrust/system/detail/sequential/transform.h", - "cuda/include/thrust/system/detail/sequential/transform_reduce.h", - "cuda/include/thrust/system/detail/sequential/transform_scan.h", - "cuda/include/thrust/system/detail/sequential/trivial_copy.h", - "cuda/include/thrust/system/detail/sequential/uninitialized_copy.h", - "cuda/include/thrust/system/detail/sequential/uninitialized_fill.h", - "cuda/include/thrust/system/detail/sequential/unique.h", - "cuda/include/thrust/system/detail/sequential/unique_by_key.h", - "cuda/include/thrust/system/detail/system_error.inl", - "cuda/include/thrust/system/error_code.h", - "cuda/include/thrust/system/omp/detail/adjacent_difference.h", - "cuda/include/thrust/system/omp/detail/assign_value.h", - "cuda/include/thrust/system/omp/detail/binary_search.h", - "cuda/include/thrust/system/omp/detail/copy.h", - "cuda/include/thrust/system/omp/detail/copy.inl", - "cuda/include/thrust/system/omp/detail/copy_if.h", - "cuda/include/thrust/system/omp/detail/copy_if.inl", - "cuda/include/thrust/system/omp/detail/count.h", - "cuda/include/thrust/system/omp/detail/default_decomposition.h", - "cuda/include/thrust/system/omp/detail/default_decomposition.inl", - "cuda/include/thrust/system/omp/detail/equal.h", - "cuda/include/thrust/system/omp/detail/execution_policy.h", - "cuda/include/thrust/system/omp/detail/extrema.h", - "cuda/include/thrust/system/omp/detail/fill.h", - "cuda/include/thrust/system/omp/detail/find.h", - "cuda/include/thrust/system/omp/detail/for_each.h", - "cuda/include/thrust/system/omp/detail/for_each.inl", - "cuda/include/thrust/system/omp/detail/gather.h", - "cuda/include/thrust/system/omp/detail/generate.h", - "cuda/include/thrust/system/omp/detail/get_value.h", - "cuda/include/thrust/system/omp/detail/inner_product.h", - "cuda/include/thrust/system/omp/detail/iter_swap.h", - "cuda/include/thrust/system/omp/detail/logical.h", - "cuda/include/thrust/system/omp/detail/malloc_and_free.h", - "cuda/include/thrust/system/omp/detail/memory.inl", - "cuda/include/thrust/system/omp/detail/merge.h", - "cuda/include/thrust/system/omp/detail/mismatch.h", - "cuda/include/thrust/system/omp/detail/par.h", - "cuda/include/thrust/system/omp/detail/partition.h", - "cuda/include/thrust/system/omp/detail/partition.inl", - "cuda/include/thrust/system/omp/detail/reduce.h", - "cuda/include/thrust/system/omp/detail/reduce.inl", - "cuda/include/thrust/system/omp/detail/reduce_by_key.h", - "cuda/include/thrust/system/omp/detail/reduce_by_key.inl", - "cuda/include/thrust/system/omp/detail/reduce_intervals.h", - "cuda/include/thrust/system/omp/detail/reduce_intervals.inl", - "cuda/include/thrust/system/omp/detail/remove.h", - "cuda/include/thrust/system/omp/detail/remove.inl", - "cuda/include/thrust/system/omp/detail/replace.h", - "cuda/include/thrust/system/omp/detail/reverse.h", - "cuda/include/thrust/system/omp/detail/scan.h", - "cuda/include/thrust/system/omp/detail/scan_by_key.h", - "cuda/include/thrust/system/omp/detail/scatter.h", - "cuda/include/thrust/system/omp/detail/sequence.h", - "cuda/include/thrust/system/omp/detail/set_operations.h", - "cuda/include/thrust/system/omp/detail/sort.h", - 
"cuda/include/thrust/system/omp/detail/sort.inl", - "cuda/include/thrust/system/omp/detail/swap_ranges.h", - "cuda/include/thrust/system/omp/detail/tabulate.h", - "cuda/include/thrust/system/omp/detail/temporary_buffer.h", - "cuda/include/thrust/system/omp/detail/transform.h", - "cuda/include/thrust/system/omp/detail/transform_reduce.h", - "cuda/include/thrust/system/omp/detail/transform_scan.h", - "cuda/include/thrust/system/omp/detail/uninitialized_copy.h", - "cuda/include/thrust/system/omp/detail/uninitialized_fill.h", - "cuda/include/thrust/system/omp/detail/unique.h", - "cuda/include/thrust/system/omp/detail/unique.inl", - "cuda/include/thrust/system/omp/detail/unique_by_key.h", - "cuda/include/thrust/system/omp/detail/unique_by_key.inl", - "cuda/include/thrust/system/omp/detail/vector.inl", - "cuda/include/thrust/system/omp/execution_policy.h", - "cuda/include/thrust/system/omp/memory.h", - "cuda/include/thrust/system/omp/vector.h", - "cuda/include/thrust/system/system_error.h", - "cuda/include/thrust/system/tbb/detail/adjacent_difference.h", - "cuda/include/thrust/system/tbb/detail/assign_value.h", - "cuda/include/thrust/system/tbb/detail/binary_search.h", - "cuda/include/thrust/system/tbb/detail/copy.h", - "cuda/include/thrust/system/tbb/detail/copy.inl", - "cuda/include/thrust/system/tbb/detail/copy_if.h", - "cuda/include/thrust/system/tbb/detail/copy_if.inl", - "cuda/include/thrust/system/tbb/detail/count.h", - "cuda/include/thrust/system/tbb/detail/equal.h", - "cuda/include/thrust/system/tbb/detail/execution_policy.h", - "cuda/include/thrust/system/tbb/detail/extrema.h", - "cuda/include/thrust/system/tbb/detail/fill.h", - "cuda/include/thrust/system/tbb/detail/find.h", - "cuda/include/thrust/system/tbb/detail/for_each.h", - "cuda/include/thrust/system/tbb/detail/for_each.inl", - "cuda/include/thrust/system/tbb/detail/gather.h", - "cuda/include/thrust/system/tbb/detail/generate.h", - "cuda/include/thrust/system/tbb/detail/get_value.h", - "cuda/include/thrust/system/tbb/detail/inner_product.h", - "cuda/include/thrust/system/tbb/detail/iter_swap.h", - "cuda/include/thrust/system/tbb/detail/logical.h", - "cuda/include/thrust/system/tbb/detail/malloc_and_free.h", - "cuda/include/thrust/system/tbb/detail/memory.inl", - "cuda/include/thrust/system/tbb/detail/merge.h", - "cuda/include/thrust/system/tbb/detail/merge.inl", - "cuda/include/thrust/system/tbb/detail/mismatch.h", - "cuda/include/thrust/system/tbb/detail/par.h", - "cuda/include/thrust/system/tbb/detail/partition.h", - "cuda/include/thrust/system/tbb/detail/partition.inl", - "cuda/include/thrust/system/tbb/detail/reduce.h", - "cuda/include/thrust/system/tbb/detail/reduce.inl", - "cuda/include/thrust/system/tbb/detail/reduce_by_key.h", - "cuda/include/thrust/system/tbb/detail/reduce_by_key.inl", - "cuda/include/thrust/system/tbb/detail/reduce_intervals.h", - "cuda/include/thrust/system/tbb/detail/remove.h", - "cuda/include/thrust/system/tbb/detail/remove.inl", - "cuda/include/thrust/system/tbb/detail/replace.h", - "cuda/include/thrust/system/tbb/detail/reverse.h", - "cuda/include/thrust/system/tbb/detail/scan.h", - "cuda/include/thrust/system/tbb/detail/scan.inl", - "cuda/include/thrust/system/tbb/detail/scan_by_key.h", - "cuda/include/thrust/system/tbb/detail/scatter.h", - "cuda/include/thrust/system/tbb/detail/sequence.h", - "cuda/include/thrust/system/tbb/detail/set_operations.h", - "cuda/include/thrust/system/tbb/detail/sort.h", - "cuda/include/thrust/system/tbb/detail/sort.inl", - 
"cuda/include/thrust/system/tbb/detail/swap_ranges.h", - "cuda/include/thrust/system/tbb/detail/tabulate.h", - "cuda/include/thrust/system/tbb/detail/temporary_buffer.h", - "cuda/include/thrust/system/tbb/detail/transform.h", - "cuda/include/thrust/system/tbb/detail/transform_reduce.h", - "cuda/include/thrust/system/tbb/detail/transform_scan.h", - "cuda/include/thrust/system/tbb/detail/uninitialized_copy.h", - "cuda/include/thrust/system/tbb/detail/uninitialized_fill.h", - "cuda/include/thrust/system/tbb/detail/unique.h", - "cuda/include/thrust/system/tbb/detail/unique.inl", - "cuda/include/thrust/system/tbb/detail/unique_by_key.h", - "cuda/include/thrust/system/tbb/detail/unique_by_key.inl", - "cuda/include/thrust/system/tbb/detail/vector.inl", - "cuda/include/thrust/system/tbb/execution_policy.h", - "cuda/include/thrust/system/tbb/memory.h", - "cuda/include/thrust/system/tbb/vector.h", - "cuda/include/thrust/system_error.h", - "cuda/include/thrust/tabulate.h", - "cuda/include/thrust/transform.h", - "cuda/include/thrust/transform_reduce.h", - "cuda/include/thrust/transform_scan.h", - "cuda/include/thrust/tuple.h", - "cuda/include/thrust/uninitialized_copy.h", - "cuda/include/thrust/uninitialized_fill.h", - "cuda/include/thrust/unique.h", - "cuda/include/thrust/version.h", - "cuda/include/vector_functions.h", - "cuda/include/vector_functions.hpp", - "cuda/include/vector_types.h", - ], - cmd = """cp -rLf "/usr/local/cuda-10.0/include/." "$(@D)/cuda/include/" """, -) - -genrule( - name = "cuda-nvvm", - outs = [ - "cuda/nvvm/libdevice/libdevice.10.bc", - ], - cmd = """cp -rLf "/usr/local/cuda-10.0/nvvm/libdevice/." "$(@D)/" """, -) - -genrule( - name = "cuda-extras", - outs = [ - "cuda/extras/CUPTI/include/GL/gl.h", - "cuda/extras/CUPTI/include/GL/glew.h", - "cuda/extras/CUPTI/include/GL/glext.h", - "cuda/extras/CUPTI/include/GL/glu.h", - "cuda/extras/CUPTI/include/GL/glut.h", - "cuda/extras/CUPTI/include/GL/glx.h", - "cuda/extras/CUPTI/include/GL/glxext.h", - "cuda/extras/CUPTI/include/GL/wglew.h", - "cuda/extras/CUPTI/include/GL/wglext.h", - "cuda/extras/CUPTI/include/cuda_stdint.h", - "cuda/extras/CUPTI/include/cupti.h", - "cuda/extras/CUPTI/include/cupti_activity.h", - "cuda/extras/CUPTI/include/cupti_callbacks.h", - "cuda/extras/CUPTI/include/cupti_driver_cbid.h", - "cuda/extras/CUPTI/include/cupti_events.h", - "cuda/extras/CUPTI/include/cupti_metrics.h", - "cuda/extras/CUPTI/include/cupti_nvtx_cbid.h", - "cuda/extras/CUPTI/include/cupti_result.h", - "cuda/extras/CUPTI/include/cupti_runtime_cbid.h", - "cuda/extras/CUPTI/include/cupti_version.h", - "cuda/extras/CUPTI/include/generated_cudaGL_meta.h", - "cuda/extras/CUPTI/include/generated_cudaVDPAU_meta.h", - "cuda/extras/CUPTI/include/generated_cuda_gl_interop_meta.h", - "cuda/extras/CUPTI/include/generated_cuda_meta.h", - "cuda/extras/CUPTI/include/generated_cuda_runtime_api_meta.h", - "cuda/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h", - "cuda/extras/CUPTI/include/generated_nvtx_meta.h", - "cuda/extras/CUPTI/include/openacc/cupti_openacc.h", - "cuda/extras/CUPTI/include/openmp/cupti_openmp.h", - "cuda/extras/CUPTI/include/openmp/ompt.h", - ], - cmd = """cp -rLf "/usr/local/cuda-10.0/extras/CUPTI/include/." 
"$(@D)/cuda/extras/CUPTI/include/" """, -) - -genrule( - name = "cublas-include", - outs = [ - "cublas/include/cublas.h", - "cublas/include/cublas_v2.h", - "cublas/include/cublas_api.h", - ], - cmd = """cp -f "/usr/local/cuda-10.0/include/cublas.h" "$(location cublas/include/cublas.h)" && \ -cp -f "/usr/local/cuda-10.0/include/cublas_v2.h" "$(location cublas/include/cublas_v2.h)" && \ -cp -f "/usr/local/cuda-10.0/include/cublas_api.h" "$(location cublas/include/cublas_api.h)" """, -) - -genrule( - name = "cuda-lib", - outs = [ - "cuda/lib/libcuda.so", - "cuda/lib/libcudart.so.10.0", - "cuda/lib/libcudart_static.a", - "cuda/lib/libcublas.so.10.0", - "cuda/lib/libcusolver.so.10.0", - "cuda/lib/libcurand.so.10.0", - "cuda/lib/libcufft.so.10.0", - "cuda/lib/libcudnn.so.7", - "cuda/lib/libcupti.so.10.0", - "cuda/lib/libcusparse.so.10.0", - ], - cmd = """cp -f "/usr/local/cuda-10.0/lib64/stubs/libcuda.so" "$(location cuda/lib/libcuda.so)" && \ -cp -f "/usr/local/cuda-10.0/lib64/libcudart.so.10.0" "$(location cuda/lib/libcudart.so.10.0)" && \ -cp -f "/usr/local/cuda-10.0/lib64/libcudart_static.a" "$(location cuda/lib/libcudart_static.a)" && \ -cp -f "/usr/local/cuda-10.0/lib64/libcublas.so.10.0" "$(location cuda/lib/libcublas.so.10.0)" && \ -cp -f "/usr/local/cuda-10.0/lib64/libcusolver.so.10.0" "$(location cuda/lib/libcusolver.so.10.0)" && \ -cp -f "/usr/local/cuda-10.0/lib64/libcurand.so.10.0" "$(location cuda/lib/libcurand.so.10.0)" && \ -cp -f "/usr/local/cuda-10.0/lib64/libcufft.so.10.0" "$(location cuda/lib/libcufft.so.10.0)" && \ -cp -f "/usr/local/cuda-10.0/lib64/libcudnn.so.7" "$(location cuda/lib/libcudnn.so.7)" && \ -cp -f "/usr/local/cuda-10.0/extras/CUPTI/lib64/libcupti.so.10.0" "$(location cuda/lib/libcupti.so.10.0)" && \ -cp -f "/usr/local/cuda-10.0/lib64/libcusparse.so.10.0" "$(location cuda/lib/libcusparse.so.10.0)" """, -) - -genrule( - name = "cuda-bin", - outs = [ - "cuda/bin/bin2c", - "cuda/bin/crt/link.stub", - "cuda/bin/crt/prelink.stub", - "cuda/bin/cuda-gdb", - "cuda/bin/cuda-gdbserver", - "cuda/bin/cuda-memcheck", - "cuda/bin/cudafe++", - "cuda/bin/cuobjdump", - "cuda/bin/fatbinary", - "cuda/bin/gpu-library-advisor", - "cuda/bin/nvcc", - "cuda/bin/nvcc.profile", - "cuda/bin/nvdisasm", - "cuda/bin/nvlink", - "cuda/bin/nvprof", - "cuda/bin/nvprune", - "cuda/bin/ptxas", - ], - cmd = """cp -rLf "/usr/local/cuda-10.0/bin/." "$(@D)/cuda/bin/" """, -) - -genrule( - name = "cudnn-include", - outs = [ - "cudnn/include/cudnn.h", - ], - cmd = """cp -f "/usr/local/cuda-10.0/include/cudnn.h" "$(location cudnn/include/cudnn.h)" """, -) diff --git a/third_party/toolchains/preconfig/centos6/cuda10.0-cudnn7/cuda/build_defs.bzl b/third_party/toolchains/preconfig/centos6/cuda10.0-cudnn7/cuda/build_defs.bzl deleted file mode 100755 index 72472e4c224..00000000000 --- a/third_party/toolchains/preconfig/centos6/cuda10.0-cudnn7/cuda/build_defs.bzl +++ /dev/null @@ -1,64 +0,0 @@ -# Macros for building CUDA code. -def if_cuda(if_true, if_false = []): - """Shorthand for select()'ing on whether we're building with CUDA. - - Returns a select statement which evaluates to if_true if we're building - with CUDA enabled. Otherwise, the select statement evaluates to if_false. 
- - """ - return select({ - "@local_config_cuda//cuda:using_nvcc": if_true, - "@local_config_cuda//cuda:using_clang": if_true, - "//conditions:default": if_false, - }) - -def cuda_default_copts(): - """Default options for all CUDA compilations.""" - return if_cuda(["-x", "cuda", "-DGOOGLE_CUDA=1"] + []) - -def cuda_is_configured(): - """Returns true if CUDA was enabled during the configure process.""" - return True - -def if_cuda_is_configured(x): - """Tests if the CUDA was enabled during the configure process. - - Unlike if_cuda(), this does not require that we are building with - --config=cuda. Used to allow non-CUDA code to depend on CUDA libraries. - """ - if cuda_is_configured(): - return select({"//conditions:default": x}) - return select({"//conditions:default": []}) - -def cuda_header_library( - name, - hdrs, - include_prefix = None, - strip_include_prefix = None, - deps = [], - **kwargs): - """Generates a cc_library containing both virtual and system include paths. - - Generates both a header-only target with virtual includes plus the full - target without virtual includes. This works around the fact that bazel can't - mix 'includes' and 'include_prefix' in the same target.""" - - native.cc_library( - name = name + "_virtual", - hdrs = hdrs, - include_prefix = include_prefix, - strip_include_prefix = strip_include_prefix, - deps = deps, - visibility = ["//visibility:private"], - ) - - native.cc_library( - name = name, - textual_hdrs = hdrs, - deps = deps + [":%s_virtual" % name], - **kwargs - ) - -def cuda_library(copts = [], **kwargs): - """Wrapper over cc_library which adds default CUDA options.""" - native.cc_library(copts = cuda_default_copts() + copts, **kwargs) diff --git a/third_party/toolchains/preconfig/centos6/cuda10.1-cudnn7/WORKSPACE b/third_party/toolchains/preconfig/centos6/cuda10.1-cudnn7/WORKSPACE deleted file mode 100644 index b61f572d6d2..00000000000 --- a/third_party/toolchains/preconfig/centos6/cuda10.1-cudnn7/WORKSPACE +++ /dev/null @@ -1,2 +0,0 @@ -# DO NOT EDIT: automatically generated WORKSPACE file for cuda_configure rule -workspace(name = "local_config_cuda") diff --git a/third_party/toolchains/preconfig/centos6/cuda10.1-cudnn7/cuda/BUILD b/third_party/toolchains/preconfig/centos6/cuda10.1-cudnn7/cuda/BUILD deleted file mode 100755 index f64204c4920..00000000000 --- a/third_party/toolchains/preconfig/centos6/cuda10.1-cudnn7/cuda/BUILD +++ /dev/null @@ -1,1357 +0,0 @@ -load(":build_defs.bzl", "cuda_header_library") -load("@bazel_skylib//:bzl_library.bzl", "bzl_library") - -licenses(["restricted"]) # MPL2, portions GPL v3, LGPL v3, BSD-like - -package(default_visibility = ["//visibility:public"]) - -config_setting( - name = "using_nvcc", - values = { - "define": "using_cuda_nvcc=true", - }, -) - -config_setting( - name = "using_clang", - values = { - "define": "using_cuda_clang=true", - }, -) - -# Equivalent to using_clang && -c opt. 
-config_setting( - name = "using_clang_opt", - values = { - "define": "using_cuda_clang=true", - "compilation_mode": "opt", - }, -) - -config_setting( - name = "darwin", - values = {"cpu": "darwin"}, -) - -config_setting( - name = "freebsd", - values = {"cpu": "freebsd"}, -) - -cuda_header_library( - name = "cuda_headers", - hdrs = [ - "cuda/cuda_config.h", - ":cuda-include", - ], - include_prefix = "third_party/gpus", - includes = [ - ".", # required to include cuda/cuda/cuda_config.h as cuda/config.h - "cuda/include", - ], -) - -cc_library( - name = "cudart_static", - srcs = ["cuda/lib/libcudart_static.a"], - linkopts = select({ - ":freebsd": [], - "//conditions:default": ["-ldl"], - }) + [ - "-lpthread", - "-lrt", - ], -) - -cc_library( - name = "cuda_driver", - srcs = ["cuda/lib/libcuda.so"], -) - -cc_library( - name = "cudart", - srcs = ["cuda/lib/libcudart.so.10.1"], - data = ["cuda/lib/libcudart.so.10.1"], - linkstatic = 1, -) - -cuda_header_library( - name = "cublas_headers", - hdrs = [":cublas-include"], - include_prefix = "third_party/gpus/cuda/include", - includes = ["cublas/include"], - strip_include_prefix = "cublas/include", - deps = [":cuda_headers"], -) - -cc_library( - name = "cublas", - srcs = ["cuda/lib/libcublas.so.10"], - data = ["cuda/lib/libcublas.so.10"], - linkstatic = 1, -) - -cc_library( - name = "cusolver", - srcs = ["cuda/lib/libcusolver.so.10"], - data = ["cuda/lib/libcusolver.so.10"], - linkopts = ["-lgomp"], - linkstatic = 1, -) - -cc_library( - name = "cudnn", - srcs = ["cuda/lib/libcudnn.so.7"], - data = ["cuda/lib/libcudnn.so.7"], - linkstatic = 1, -) - -cc_library( - name = "cudnn_header", - hdrs = [":cudnn-include"], - include_prefix = "third_party/gpus/cudnn", - strip_include_prefix = "cudnn/include", - deps = [":cuda_headers"], -) - -cc_library( - name = "cufft", - srcs = ["cuda/lib/libcufft.so.10"], - data = ["cuda/lib/libcufft.so.10"], - linkstatic = 1, -) - -cc_library( - name = "curand", - srcs = ["cuda/lib/libcurand.so.10"], - data = ["cuda/lib/libcurand.so.10"], - linkstatic = 1, -) - -cc_library( - name = "cuda", - deps = [ - ":cublas", - ":cuda_headers", - ":cudart", - ":cudnn", - ":cufft", - ":curand", - ], -) - -cuda_header_library( - name = "cupti_headers", - hdrs = [":cuda-extras"], - include_prefix = "third_party/gpus", - includes = ["cuda/extras/CUPTI/include/"], - deps = [":cuda_headers"], -) - -cc_library( - name = "cupti_dsos", - data = ["cuda/lib/libcupti.so.10.1"], -) - -cc_library( - name = "cusparse", - srcs = ["cuda/lib/libcusparse.so.10"], - data = ["cuda/lib/libcusparse.so.10"], - linkopts = ["-lgomp"], - linkstatic = 1, -) - -cc_library( - name = "libdevice_root", - data = [":cuda-nvvm"], -) - -bzl_library( - name = "build_defs_bzl", - srcs = ["build_defs.bzl"], - deps = [ - "@bazel_skylib//lib:selects", - ], -) - -genrule( - name = "cuda-include", - outs = [ - "cuda/include/CL/cl.h", - "cuda/include/CL/cl.hpp", - "cuda/include/CL/cl_egl.h", - "cuda/include/CL/cl_ext.h", - "cuda/include/CL/cl_gl.h", - "cuda/include/CL/cl_gl_ext.h", - "cuda/include/CL/cl_platform.h", - "cuda/include/CL/opencl.h", - "cuda/include/builtin_types.h", - "cuda/include/channel_descriptor.h", - "cuda/include/common_functions.h", - "cuda/include/cooperative_groups.h", - "cuda/include/cooperative_groups_helpers.h", - "cuda/include/crt/common_functions.h", - "cuda/include/crt/device_double_functions.h", - "cuda/include/crt/device_double_functions.hpp", - "cuda/include/crt/device_functions.h", - "cuda/include/crt/device_functions.hpp", - 
"cuda/include/crt/func_macro.h", - "cuda/include/crt/host_config.h", - "cuda/include/crt/host_defines.h", - "cuda/include/crt/host_runtime.h", - "cuda/include/crt/math_functions.h", - "cuda/include/crt/math_functions.hpp", - "cuda/include/crt/mma.h", - "cuda/include/crt/mma.hpp", - "cuda/include/crt/nvfunctional", - "cuda/include/crt/sm_70_rt.h", - "cuda/include/crt/sm_70_rt.hpp", - "cuda/include/crt/storage_class.h", - "cuda/include/cuComplex.h", - "cuda/include/cuda.h", - "cuda/include/cudaEGL.h", - "cuda/include/cudaGL.h", - "cuda/include/cudaProfiler.h", - "cuda/include/cudaVDPAU.h", - "cuda/include/cuda_device_runtime_api.h", - "cuda/include/cuda_egl_interop.h", - "cuda/include/cuda_fp16.h", - "cuda/include/cuda_fp16.hpp", - "cuda/include/cuda_gl_interop.h", - "cuda/include/cuda_occupancy.h", - "cuda/include/cuda_profiler_api.h", - "cuda/include/cuda_runtime.h", - "cuda/include/cuda_runtime_api.h", - "cuda/include/cuda_surface_types.h", - "cuda/include/cuda_texture_types.h", - "cuda/include/cuda_vdpau_interop.h", - "cuda/include/cudalibxt.h", - "cuda/include/cudart_platform.h", - "cuda/include/cudnn.h", - "cuda/include/cufft.h", - "cuda/include/cufftXt.h", - "cuda/include/cufftw.h", - "cuda/include/curand.h", - "cuda/include/curand_discrete.h", - "cuda/include/curand_discrete2.h", - "cuda/include/curand_globals.h", - "cuda/include/curand_kernel.h", - "cuda/include/curand_lognormal.h", - "cuda/include/curand_mrg32k3a.h", - "cuda/include/curand_mtgp32.h", - "cuda/include/curand_mtgp32_host.h", - "cuda/include/curand_mtgp32_kernel.h", - "cuda/include/curand_mtgp32dc_p_11213.h", - "cuda/include/curand_normal.h", - "cuda/include/curand_normal_static.h", - "cuda/include/curand_philox4x32_x.h", - "cuda/include/curand_poisson.h", - "cuda/include/curand_precalc.h", - "cuda/include/curand_uniform.h", - "cuda/include/cusolverDn.h", - "cuda/include/cusolverRf.h", - "cuda/include/cusolverSp.h", - "cuda/include/cusolverSp_LOWLEVEL_PREVIEW.h", - "cuda/include/cusolver_common.h", - "cuda/include/cusparse.h", - "cuda/include/cusparse_v2.h", - "cuda/include/device_atomic_functions.h", - "cuda/include/device_atomic_functions.hpp", - "cuda/include/device_double_functions.h", - "cuda/include/device_functions.h", - "cuda/include/device_launch_parameters.h", - "cuda/include/device_types.h", - "cuda/include/driver_functions.h", - "cuda/include/driver_types.h", - "cuda/include/fatBinaryCtl.h", - "cuda/include/fatbinary.h", - "cuda/include/fatbinary_section.h", - "cuda/include/host_config.h", - "cuda/include/host_defines.h", - "cuda/include/library_types.h", - "cuda/include/math_constants.h", - "cuda/include/math_functions.h", - "cuda/include/mma.h", - "cuda/include/npp.h", - "cuda/include/nppcore.h", - "cuda/include/nppdefs.h", - "cuda/include/nppi.h", - "cuda/include/nppi_arithmetic_and_logical_operations.h", - "cuda/include/nppi_color_conversion.h", - "cuda/include/nppi_compression_functions.h", - "cuda/include/nppi_computer_vision.h", - "cuda/include/nppi_data_exchange_and_initialization.h", - "cuda/include/nppi_filtering_functions.h", - "cuda/include/nppi_geometry_transforms.h", - "cuda/include/nppi_linear_transforms.h", - "cuda/include/nppi_morphological_operations.h", - "cuda/include/nppi_statistics_functions.h", - "cuda/include/nppi_support_functions.h", - "cuda/include/nppi_threshold_and_compare_operations.h", - "cuda/include/npps.h", - "cuda/include/npps_arithmetic_and_logical_operations.h", - "cuda/include/npps_conversion_functions.h", - "cuda/include/npps_filtering_functions.h", - 
"cuda/include/npps_initialization.h", - "cuda/include/npps_statistics_functions.h", - "cuda/include/npps_support_functions.h", - "cuda/include/nppversion.h", - "cuda/include/nvToolsExt.h", - "cuda/include/nvToolsExtCuda.h", - "cuda/include/nvToolsExtCudaRt.h", - "cuda/include/nvToolsExtMeta.h", - "cuda/include/nvToolsExtSync.h", - "cuda/include/nvfunctional", - "cuda/include/nvgraph.h", - "cuda/include/nvjpeg.h", - "cuda/include/nvml.h", - "cuda/include/nvrtc.h", - "cuda/include/nvtx3/nvToolsExt.h", - "cuda/include/nvtx3/nvToolsExtCuda.h", - "cuda/include/nvtx3/nvToolsExtCudaRt.h", - "cuda/include/nvtx3/nvToolsExtOpenCL.h", - "cuda/include/nvtx3/nvToolsExtSync.h", - "cuda/include/nvtx3/nvtxDetail/nvtxImpl.h", - "cuda/include/nvtx3/nvtxDetail/nvtxImplCore.h", - "cuda/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h", - "cuda/include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h", - "cuda/include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h", - "cuda/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h", - "cuda/include/nvtx3/nvtxDetail/nvtxInit.h", - "cuda/include/nvtx3/nvtxDetail/nvtxInitDecls.h", - "cuda/include/nvtx3/nvtxDetail/nvtxInitDefs.h", - "cuda/include/nvtx3/nvtxDetail/nvtxLinkOnce.h", - "cuda/include/nvtx3/nvtxDetail/nvtxTypes.h", - "cuda/include/sm_20_atomic_functions.h", - "cuda/include/sm_20_atomic_functions.hpp", - "cuda/include/sm_20_intrinsics.h", - "cuda/include/sm_20_intrinsics.hpp", - "cuda/include/sm_30_intrinsics.h", - "cuda/include/sm_30_intrinsics.hpp", - "cuda/include/sm_32_atomic_functions.h", - "cuda/include/sm_32_atomic_functions.hpp", - "cuda/include/sm_32_intrinsics.h", - "cuda/include/sm_32_intrinsics.hpp", - "cuda/include/sm_35_atomic_functions.h", - "cuda/include/sm_35_intrinsics.h", - "cuda/include/sm_60_atomic_functions.h", - "cuda/include/sm_60_atomic_functions.hpp", - "cuda/include/sm_61_intrinsics.h", - "cuda/include/sm_61_intrinsics.hpp", - "cuda/include/sobol_direction_vectors.h", - "cuda/include/surface_functions.h", - "cuda/include/surface_functions.hpp", - "cuda/include/surface_indirect_functions.h", - "cuda/include/surface_indirect_functions.hpp", - "cuda/include/surface_types.h", - "cuda/include/texture_fetch_functions.h", - "cuda/include/texture_fetch_functions.hpp", - "cuda/include/texture_indirect_functions.h", - "cuda/include/texture_indirect_functions.hpp", - "cuda/include/texture_types.h", - "cuda/include/thrust/addressof.h", - "cuda/include/thrust/adjacent_difference.h", - "cuda/include/thrust/advance.h", - "cuda/include/thrust/allocate_unique.h", - "cuda/include/thrust/async/copy.h", - "cuda/include/thrust/async/for_each.h", - "cuda/include/thrust/async/reduce.h", - "cuda/include/thrust/async/sort.h", - "cuda/include/thrust/async/transform.h", - "cuda/include/thrust/binary_search.h", - "cuda/include/thrust/complex.h", - "cuda/include/thrust/copy.h", - "cuda/include/thrust/count.h", - "cuda/include/thrust/detail/adjacent_difference.inl", - "cuda/include/thrust/detail/advance.inl", - "cuda/include/thrust/detail/alignment.h", - "cuda/include/thrust/detail/allocator/allocator_traits.h", - "cuda/include/thrust/detail/allocator/allocator_traits.inl", - "cuda/include/thrust/detail/allocator/copy_construct_range.h", - "cuda/include/thrust/detail/allocator/copy_construct_range.inl", - "cuda/include/thrust/detail/allocator/default_construct_range.h", - "cuda/include/thrust/detail/allocator/default_construct_range.inl", - "cuda/include/thrust/detail/allocator/destroy_range.h", - "cuda/include/thrust/detail/allocator/destroy_range.inl", - 
"cuda/include/thrust/detail/allocator/fill_construct_range.h", - "cuda/include/thrust/detail/allocator/fill_construct_range.inl", - "cuda/include/thrust/detail/allocator/malloc_allocator.h", - "cuda/include/thrust/detail/allocator/malloc_allocator.inl", - "cuda/include/thrust/detail/allocator/no_throw_allocator.h", - "cuda/include/thrust/detail/allocator/tagged_allocator.h", - "cuda/include/thrust/detail/allocator/tagged_allocator.inl", - "cuda/include/thrust/detail/allocator/temporary_allocator.h", - "cuda/include/thrust/detail/allocator/temporary_allocator.inl", - "cuda/include/thrust/detail/allocator_aware_execution_policy.h", - "cuda/include/thrust/detail/binary_search.inl", - "cuda/include/thrust/detail/complex/arithmetic.h", - "cuda/include/thrust/detail/complex/c99math.h", - "cuda/include/thrust/detail/complex/catrig.h", - "cuda/include/thrust/detail/complex/catrigf.h", - "cuda/include/thrust/detail/complex/ccosh.h", - "cuda/include/thrust/detail/complex/ccoshf.h", - "cuda/include/thrust/detail/complex/cexp.h", - "cuda/include/thrust/detail/complex/cexpf.h", - "cuda/include/thrust/detail/complex/clog.h", - "cuda/include/thrust/detail/complex/clogf.h", - "cuda/include/thrust/detail/complex/complex.inl", - "cuda/include/thrust/detail/complex/cpow.h", - "cuda/include/thrust/detail/complex/cproj.h", - "cuda/include/thrust/detail/complex/csinh.h", - "cuda/include/thrust/detail/complex/csinhf.h", - "cuda/include/thrust/detail/complex/csqrt.h", - "cuda/include/thrust/detail/complex/csqrtf.h", - "cuda/include/thrust/detail/complex/ctanh.h", - "cuda/include/thrust/detail/complex/ctanhf.h", - "cuda/include/thrust/detail/complex/math_private.h", - "cuda/include/thrust/detail/complex/stream.h", - "cuda/include/thrust/detail/config.h", - "cuda/include/thrust/detail/config/compiler.h", - "cuda/include/thrust/detail/config/compiler_fence.h", - "cuda/include/thrust/detail/config/config.h", - "cuda/include/thrust/detail/config/cpp_compatibility.h", - "cuda/include/thrust/detail/config/cpp_dialect.h", - "cuda/include/thrust/detail/config/debug.h", - "cuda/include/thrust/detail/config/device_system.h", - "cuda/include/thrust/detail/config/exec_check_disable.h", - "cuda/include/thrust/detail/config/forceinline.h", - "cuda/include/thrust/detail/config/global_workarounds.h", - "cuda/include/thrust/detail/config/host_device.h", - "cuda/include/thrust/detail/config/host_system.h", - "cuda/include/thrust/detail/config/simple_defines.h", - "cuda/include/thrust/detail/contiguous_storage.h", - "cuda/include/thrust/detail/contiguous_storage.inl", - "cuda/include/thrust/detail/copy.h", - "cuda/include/thrust/detail/copy.inl", - "cuda/include/thrust/detail/copy_if.h", - "cuda/include/thrust/detail/copy_if.inl", - "cuda/include/thrust/detail/count.inl", - "cuda/include/thrust/detail/cpp11_required.h", - "cuda/include/thrust/detail/cstdint.h", - "cuda/include/thrust/detail/dependencies_aware_execution_policy.h", - "cuda/include/thrust/detail/device_delete.inl", - "cuda/include/thrust/detail/device_free.inl", - "cuda/include/thrust/detail/device_malloc.inl", - "cuda/include/thrust/detail/device_new.inl", - "cuda/include/thrust/detail/device_ptr.inl", - "cuda/include/thrust/detail/device_reference.inl", - "cuda/include/thrust/detail/device_vector.inl", - "cuda/include/thrust/detail/distance.inl", - "cuda/include/thrust/detail/equal.inl", - "cuda/include/thrust/detail/event_error.h", - "cuda/include/thrust/detail/execute_with_allocator.h", - "cuda/include/thrust/detail/execute_with_allocator_fwd.h", - 
"cuda/include/thrust/detail/execute_with_dependencies.h", - "cuda/include/thrust/detail/execution_policy.h", - "cuda/include/thrust/detail/extrema.inl", - "cuda/include/thrust/detail/fill.inl", - "cuda/include/thrust/detail/find.inl", - "cuda/include/thrust/detail/for_each.inl", - "cuda/include/thrust/detail/function.h", - "cuda/include/thrust/detail/functional.inl", - "cuda/include/thrust/detail/functional/actor.h", - "cuda/include/thrust/detail/functional/actor.inl", - "cuda/include/thrust/detail/functional/argument.h", - "cuda/include/thrust/detail/functional/composite.h", - "cuda/include/thrust/detail/functional/operators.h", - "cuda/include/thrust/detail/functional/operators/arithmetic_operators.h", - "cuda/include/thrust/detail/functional/operators/assignment_operator.h", - "cuda/include/thrust/detail/functional/operators/bitwise_operators.h", - "cuda/include/thrust/detail/functional/operators/compound_assignment_operators.h", - "cuda/include/thrust/detail/functional/operators/logical_operators.h", - "cuda/include/thrust/detail/functional/operators/operator_adaptors.h", - "cuda/include/thrust/detail/functional/operators/relational_operators.h", - "cuda/include/thrust/detail/functional/placeholder.h", - "cuda/include/thrust/detail/functional/value.h", - "cuda/include/thrust/detail/gather.inl", - "cuda/include/thrust/detail/generate.inl", - "cuda/include/thrust/detail/get_iterator_value.h", - "cuda/include/thrust/detail/host_vector.inl", - "cuda/include/thrust/detail/inner_product.inl", - "cuda/include/thrust/detail/integer_math.h", - "cuda/include/thrust/detail/integer_traits.h", - "cuda/include/thrust/detail/internal_functional.h", - "cuda/include/thrust/detail/logical.inl", - "cuda/include/thrust/detail/malloc_and_free.h", - "cuda/include/thrust/detail/memory_algorithms.h", - "cuda/include/thrust/detail/merge.inl", - "cuda/include/thrust/detail/minmax.h", - "cuda/include/thrust/detail/mismatch.inl", - "cuda/include/thrust/detail/modern_gcc_required.h", - "cuda/include/thrust/detail/mpl/math.h", - "cuda/include/thrust/detail/numeric_traits.h", - "cuda/include/thrust/detail/overlapped_copy.h", - "cuda/include/thrust/detail/pair.inl", - "cuda/include/thrust/detail/partition.inl", - "cuda/include/thrust/detail/pointer.h", - "cuda/include/thrust/detail/pointer.inl", - "cuda/include/thrust/detail/preprocessor.h", - "cuda/include/thrust/detail/range/head_flags.h", - "cuda/include/thrust/detail/range/tail_flags.h", - "cuda/include/thrust/detail/raw_pointer_cast.h", - "cuda/include/thrust/detail/raw_reference_cast.h", - "cuda/include/thrust/detail/reduce.inl", - "cuda/include/thrust/detail/reference.h", - "cuda/include/thrust/detail/reference.inl", - "cuda/include/thrust/detail/reference_forward_declaration.h", - "cuda/include/thrust/detail/remove.inl", - "cuda/include/thrust/detail/replace.inl", - "cuda/include/thrust/detail/reverse.inl", - "cuda/include/thrust/detail/scan.inl", - "cuda/include/thrust/detail/scatter.inl", - "cuda/include/thrust/detail/select_system.h", - "cuda/include/thrust/detail/seq.h", - "cuda/include/thrust/detail/sequence.inl", - "cuda/include/thrust/detail/set_operations.inl", - "cuda/include/thrust/detail/sort.inl", - "cuda/include/thrust/detail/static_assert.h", - "cuda/include/thrust/detail/static_map.h", - "cuda/include/thrust/detail/swap.h", - "cuda/include/thrust/detail/swap.inl", - "cuda/include/thrust/detail/swap_ranges.inl", - "cuda/include/thrust/detail/tabulate.inl", - "cuda/include/thrust/detail/temporary_array.h", - 
"cuda/include/thrust/detail/temporary_array.inl", - "cuda/include/thrust/detail/temporary_buffer.h", - "cuda/include/thrust/detail/transform.inl", - "cuda/include/thrust/detail/transform_reduce.inl", - "cuda/include/thrust/detail/transform_scan.inl", - "cuda/include/thrust/detail/trivial_sequence.h", - "cuda/include/thrust/detail/tuple.inl", - "cuda/include/thrust/detail/tuple_algorithms.h", - "cuda/include/thrust/detail/tuple_meta_transform.h", - "cuda/include/thrust/detail/tuple_transform.h", - "cuda/include/thrust/detail/type_deduction.h", - "cuda/include/thrust/detail/type_traits.h", - "cuda/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h", - "cuda/include/thrust/detail/type_traits/function_traits.h", - "cuda/include/thrust/detail/type_traits/has_member_function.h", - "cuda/include/thrust/detail/type_traits/has_nested_type.h", - "cuda/include/thrust/detail/type_traits/has_trivial_assign.h", - "cuda/include/thrust/detail/type_traits/is_call_possible.h", - "cuda/include/thrust/detail/type_traits/is_metafunction_defined.h", - "cuda/include/thrust/detail/type_traits/iterator/is_discard_iterator.h", - "cuda/include/thrust/detail/type_traits/iterator/is_output_iterator.h", - "cuda/include/thrust/detail/type_traits/minimum_type.h", - "cuda/include/thrust/detail/type_traits/pointer_traits.h", - "cuda/include/thrust/detail/type_traits/result_of_adaptable_function.h", - "cuda/include/thrust/detail/uninitialized_copy.inl", - "cuda/include/thrust/detail/uninitialized_fill.inl", - "cuda/include/thrust/detail/unique.inl", - "cuda/include/thrust/detail/use_default.h", - "cuda/include/thrust/detail/util/align.h", - "cuda/include/thrust/detail/util/blocking.h", - "cuda/include/thrust/detail/vector_base.h", - "cuda/include/thrust/detail/vector_base.inl", - "cuda/include/thrust/device_allocator.h", - "cuda/include/thrust/device_delete.h", - "cuda/include/thrust/device_free.h", - "cuda/include/thrust/device_make_unique.h", - "cuda/include/thrust/device_malloc.h", - "cuda/include/thrust/device_malloc_allocator.h", - "cuda/include/thrust/device_new.h", - "cuda/include/thrust/device_new_allocator.h", - "cuda/include/thrust/device_ptr.h", - "cuda/include/thrust/device_reference.h", - "cuda/include/thrust/device_vector.h", - "cuda/include/thrust/distance.h", - "cuda/include/thrust/equal.h", - "cuda/include/thrust/event.h", - "cuda/include/thrust/execution_policy.h", - "cuda/include/thrust/extrema.h", - "cuda/include/thrust/fill.h", - "cuda/include/thrust/find.h", - "cuda/include/thrust/for_each.h", - "cuda/include/thrust/functional.h", - "cuda/include/thrust/future.h", - "cuda/include/thrust/gather.h", - "cuda/include/thrust/generate.h", - "cuda/include/thrust/host_vector.h", - "cuda/include/thrust/inner_product.h", - "cuda/include/thrust/iterator/constant_iterator.h", - "cuda/include/thrust/iterator/counting_iterator.h", - "cuda/include/thrust/iterator/detail/any_assign.h", - "cuda/include/thrust/iterator/detail/any_system_tag.h", - "cuda/include/thrust/iterator/detail/constant_iterator_base.h", - "cuda/include/thrust/iterator/detail/counting_iterator.inl", - "cuda/include/thrust/iterator/detail/device_system_tag.h", - "cuda/include/thrust/iterator/detail/discard_iterator_base.h", - "cuda/include/thrust/iterator/detail/distance_from_result.h", - "cuda/include/thrust/iterator/detail/host_system_tag.h", - "cuda/include/thrust/iterator/detail/is_iterator_category.h", - "cuda/include/thrust/iterator/detail/iterator_adaptor_base.h", - 
"cuda/include/thrust/iterator/detail/iterator_category_to_system.h", - "cuda/include/thrust/iterator/detail/iterator_category_to_traversal.h", - "cuda/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h", - "cuda/include/thrust/iterator/detail/iterator_facade_category.h", - "cuda/include/thrust/iterator/detail/iterator_traits.inl", - "cuda/include/thrust/iterator/detail/iterator_traversal_tags.h", - "cuda/include/thrust/iterator/detail/join_iterator.h", - "cuda/include/thrust/iterator/detail/minimum_category.h", - "cuda/include/thrust/iterator/detail/minimum_system.h", - "cuda/include/thrust/iterator/detail/normal_iterator.h", - "cuda/include/thrust/iterator/detail/permutation_iterator_base.h", - "cuda/include/thrust/iterator/detail/retag.h", - "cuda/include/thrust/iterator/detail/reverse_iterator.inl", - "cuda/include/thrust/iterator/detail/reverse_iterator_base.h", - "cuda/include/thrust/iterator/detail/tagged_iterator.h", - "cuda/include/thrust/iterator/detail/transform_iterator.inl", - "cuda/include/thrust/iterator/detail/transform_output_iterator.inl", - "cuda/include/thrust/iterator/detail/tuple_of_iterator_references.h", - "cuda/include/thrust/iterator/detail/universal_categories.h", - "cuda/include/thrust/iterator/detail/zip_iterator.inl", - "cuda/include/thrust/iterator/detail/zip_iterator_base.h", - "cuda/include/thrust/iterator/discard_iterator.h", - "cuda/include/thrust/iterator/iterator_adaptor.h", - "cuda/include/thrust/iterator/iterator_categories.h", - "cuda/include/thrust/iterator/iterator_facade.h", - "cuda/include/thrust/iterator/iterator_traits.h", - "cuda/include/thrust/iterator/permutation_iterator.h", - "cuda/include/thrust/iterator/retag.h", - "cuda/include/thrust/iterator/reverse_iterator.h", - "cuda/include/thrust/iterator/transform_iterator.h", - "cuda/include/thrust/iterator/transform_output_iterator.h", - "cuda/include/thrust/iterator/zip_iterator.h", - "cuda/include/thrust/limits.h", - "cuda/include/thrust/logical.h", - "cuda/include/thrust/memory.h", - "cuda/include/thrust/memory/detail/device_system_resource.h", - "cuda/include/thrust/memory/detail/host_system_resource.h", - "cuda/include/thrust/merge.h", - "cuda/include/thrust/mismatch.h", - "cuda/include/thrust/mr/allocator.h", - "cuda/include/thrust/mr/detail/config.h", - "cuda/include/thrust/mr/disjoint_pool.h", - "cuda/include/thrust/mr/disjoint_sync_pool.h", - "cuda/include/thrust/mr/disjoint_tls_pool.h", - "cuda/include/thrust/mr/fancy_pointer_resource.h", - "cuda/include/thrust/mr/memory_resource.h", - "cuda/include/thrust/mr/new.h", - "cuda/include/thrust/mr/polymorphic_adaptor.h", - "cuda/include/thrust/mr/pool.h", - "cuda/include/thrust/mr/pool_options.h", - "cuda/include/thrust/mr/sync_pool.h", - "cuda/include/thrust/mr/tls_pool.h", - "cuda/include/thrust/mr/validator.h", - "cuda/include/thrust/optional.h", - "cuda/include/thrust/pair.h", - "cuda/include/thrust/partition.h", - "cuda/include/thrust/per_device_resource.h", - "cuda/include/thrust/random.h", - "cuda/include/thrust/random/detail/discard_block_engine.inl", - "cuda/include/thrust/random/detail/linear_congruential_engine.inl", - "cuda/include/thrust/random/detail/linear_congruential_engine_discard.h", - "cuda/include/thrust/random/detail/linear_feedback_shift_engine.inl", - "cuda/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h", - "cuda/include/thrust/random/detail/mod.h", - "cuda/include/thrust/random/detail/normal_distribution.inl", - 
"cuda/include/thrust/random/detail/normal_distribution_base.h", - "cuda/include/thrust/random/detail/random_core_access.h", - "cuda/include/thrust/random/detail/subtract_with_carry_engine.inl", - "cuda/include/thrust/random/detail/uniform_int_distribution.inl", - "cuda/include/thrust/random/detail/uniform_real_distribution.inl", - "cuda/include/thrust/random/detail/xor_combine_engine.inl", - "cuda/include/thrust/random/detail/xor_combine_engine_max.h", - "cuda/include/thrust/random/discard_block_engine.h", - "cuda/include/thrust/random/linear_congruential_engine.h", - "cuda/include/thrust/random/linear_feedback_shift_engine.h", - "cuda/include/thrust/random/normal_distribution.h", - "cuda/include/thrust/random/subtract_with_carry_engine.h", - "cuda/include/thrust/random/uniform_int_distribution.h", - "cuda/include/thrust/random/uniform_real_distribution.h", - "cuda/include/thrust/random/xor_combine_engine.h", - "cuda/include/thrust/reduce.h", - "cuda/include/thrust/remove.h", - "cuda/include/thrust/replace.h", - "cuda/include/thrust/reverse.h", - "cuda/include/thrust/scan.h", - "cuda/include/thrust/scatter.h", - "cuda/include/thrust/sequence.h", - "cuda/include/thrust/set_operations.h", - "cuda/include/thrust/sort.h", - "cuda/include/thrust/swap.h", - "cuda/include/thrust/system/cpp/detail/adjacent_difference.h", - "cuda/include/thrust/system/cpp/detail/assign_value.h", - "cuda/include/thrust/system/cpp/detail/binary_search.h", - "cuda/include/thrust/system/cpp/detail/copy.h", - "cuda/include/thrust/system/cpp/detail/copy_if.h", - "cuda/include/thrust/system/cpp/detail/count.h", - "cuda/include/thrust/system/cpp/detail/equal.h", - "cuda/include/thrust/system/cpp/detail/execution_policy.h", - "cuda/include/thrust/system/cpp/detail/extrema.h", - "cuda/include/thrust/system/cpp/detail/fill.h", - "cuda/include/thrust/system/cpp/detail/find.h", - "cuda/include/thrust/system/cpp/detail/for_each.h", - "cuda/include/thrust/system/cpp/detail/gather.h", - "cuda/include/thrust/system/cpp/detail/generate.h", - "cuda/include/thrust/system/cpp/detail/get_value.h", - "cuda/include/thrust/system/cpp/detail/inner_product.h", - "cuda/include/thrust/system/cpp/detail/iter_swap.h", - "cuda/include/thrust/system/cpp/detail/logical.h", - "cuda/include/thrust/system/cpp/detail/malloc_and_free.h", - "cuda/include/thrust/system/cpp/detail/memory.inl", - "cuda/include/thrust/system/cpp/detail/merge.h", - "cuda/include/thrust/system/cpp/detail/mismatch.h", - "cuda/include/thrust/system/cpp/detail/par.h", - "cuda/include/thrust/system/cpp/detail/partition.h", - "cuda/include/thrust/system/cpp/detail/per_device_resource.h", - "cuda/include/thrust/system/cpp/detail/pointer.inl", - "cuda/include/thrust/system/cpp/detail/reduce.h", - "cuda/include/thrust/system/cpp/detail/reduce_by_key.h", - "cuda/include/thrust/system/cpp/detail/remove.h", - "cuda/include/thrust/system/cpp/detail/replace.h", - "cuda/include/thrust/system/cpp/detail/reverse.h", - "cuda/include/thrust/system/cpp/detail/scan.h", - "cuda/include/thrust/system/cpp/detail/scan_by_key.h", - "cuda/include/thrust/system/cpp/detail/scatter.h", - "cuda/include/thrust/system/cpp/detail/sequence.h", - "cuda/include/thrust/system/cpp/detail/set_operations.h", - "cuda/include/thrust/system/cpp/detail/sort.h", - "cuda/include/thrust/system/cpp/detail/swap_ranges.h", - "cuda/include/thrust/system/cpp/detail/tabulate.h", - "cuda/include/thrust/system/cpp/detail/temporary_buffer.h", - "cuda/include/thrust/system/cpp/detail/transform.h", - 
"cuda/include/thrust/system/cpp/detail/transform_reduce.h", - "cuda/include/thrust/system/cpp/detail/transform_scan.h", - "cuda/include/thrust/system/cpp/detail/uninitialized_copy.h", - "cuda/include/thrust/system/cpp/detail/uninitialized_fill.h", - "cuda/include/thrust/system/cpp/detail/unique.h", - "cuda/include/thrust/system/cpp/detail/unique_by_key.h", - "cuda/include/thrust/system/cpp/detail/vector.inl", - "cuda/include/thrust/system/cpp/execution_policy.h", - "cuda/include/thrust/system/cpp/memory.h", - "cuda/include/thrust/system/cpp/memory_resource.h", - "cuda/include/thrust/system/cpp/pointer.h", - "cuda/include/thrust/system/cpp/vector.h", - "cuda/include/thrust/system/cuda/config.h", - "cuda/include/thrust/system/cuda/detail/adjacent_difference.h", - "cuda/include/thrust/system/cuda/detail/assign_value.h", - "cuda/include/thrust/system/cuda/detail/async/copy.h", - "cuda/include/thrust/system/cuda/detail/async/customization.h", - "cuda/include/thrust/system/cuda/detail/async/for_each.h", - "cuda/include/thrust/system/cuda/detail/async/reduce.h", - "cuda/include/thrust/system/cuda/detail/async/sort.h", - "cuda/include/thrust/system/cuda/detail/async/transform.h", - "cuda/include/thrust/system/cuda/detail/binary_search.h", - "cuda/include/thrust/system/cuda/detail/copy.h", - "cuda/include/thrust/system/cuda/detail/copy_if.h", - "cuda/include/thrust/system/cuda/detail/core/agent_launcher.h", - "cuda/include/thrust/system/cuda/detail/core/alignment.h", - "cuda/include/thrust/system/cuda/detail/core/triple_chevron_launch.h", - "cuda/include/thrust/system/cuda/detail/core/util.h", - "cuda/include/thrust/system/cuda/detail/count.h", - "cuda/include/thrust/system/cuda/detail/cross_system.h", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_load.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_scan.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_store.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh", - 
"cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh", - "cuda/include/thrust/system/cuda/detail/cub/cub.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_partition.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_scan.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_select.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh", - "cuda/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh", - "cuda/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh", - "cuda/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh", - "cuda/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh", - "cuda/include/thrust/system/cuda/detail/cub/host/mutex.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh", - 
"cuda/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh", - "cuda/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh", - "cuda/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh", - "cuda/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh", - "cuda/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh", - "cuda/include/thrust/system/cuda/detail/cub/util_allocator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/util_arch.cuh", - "cuda/include/thrust/system/cuda/detail/cub/util_debug.cuh", - "cuda/include/thrust/system/cuda/detail/cub/util_device.cuh", - "cuda/include/thrust/system/cuda/detail/cub/util_macro.cuh", - "cuda/include/thrust/system/cuda/detail/cub/util_namespace.cuh", - "cuda/include/thrust/system/cuda/detail/cub/util_ptx.cuh", - "cuda/include/thrust/system/cuda/detail/cub/util_type.cuh", - "cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh", - "cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh", - "cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh", - "cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh", - "cuda/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh", - "cuda/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh", - "cuda/include/thrust/system/cuda/detail/equal.h", - "cuda/include/thrust/system/cuda/detail/error.inl", - "cuda/include/thrust/system/cuda/detail/execution_policy.h", - "cuda/include/thrust/system/cuda/detail/extrema.h", - "cuda/include/thrust/system/cuda/detail/fill.h", - "cuda/include/thrust/system/cuda/detail/find.h", - "cuda/include/thrust/system/cuda/detail/for_each.h", - "cuda/include/thrust/system/cuda/detail/future.inl", - "cuda/include/thrust/system/cuda/detail/gather.h", - "cuda/include/thrust/system/cuda/detail/generate.h", - "cuda/include/thrust/system/cuda/detail/get_value.h", - "cuda/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h", - "cuda/include/thrust/system/cuda/detail/guarded_driver_types.h", - "cuda/include/thrust/system/cuda/detail/inner_product.h", - "cuda/include/thrust/system/cuda/detail/internal/copy_cross_system.h", - "cuda/include/thrust/system/cuda/detail/internal/copy_device_to_device.h", - "cuda/include/thrust/system/cuda/detail/iter_swap.h", - "cuda/include/thrust/system/cuda/detail/logical.h", - "cuda/include/thrust/system/cuda/detail/malloc_and_free.h", - "cuda/include/thrust/system/cuda/detail/memory.inl", - "cuda/include/thrust/system/cuda/detail/merge.h", - "cuda/include/thrust/system/cuda/detail/mismatch.h", - "cuda/include/thrust/system/cuda/detail/par.h", - "cuda/include/thrust/system/cuda/detail/par_to_seq.h", - "cuda/include/thrust/system/cuda/detail/parallel_for.h", - "cuda/include/thrust/system/cuda/detail/partition.h", - "cuda/include/thrust/system/cuda/detail/per_device_resource.h", - "cuda/include/thrust/system/cuda/detail/pointer.inl", - "cuda/include/thrust/system/cuda/detail/reduce.h", - "cuda/include/thrust/system/cuda/detail/reduce_by_key.h", - "cuda/include/thrust/system/cuda/detail/remove.h", - "cuda/include/thrust/system/cuda/detail/replace.h", - "cuda/include/thrust/system/cuda/detail/reverse.h", - "cuda/include/thrust/system/cuda/detail/scan.h", - "cuda/include/thrust/system/cuda/detail/scan_by_key.h", - "cuda/include/thrust/system/cuda/detail/scatter.h", - "cuda/include/thrust/system/cuda/detail/sequence.h", - "cuda/include/thrust/system/cuda/detail/set_operations.h", - 
"cuda/include/thrust/system/cuda/detail/sort.h", - "cuda/include/thrust/system/cuda/detail/swap_ranges.h", - "cuda/include/thrust/system/cuda/detail/tabulate.h", - "cuda/include/thrust/system/cuda/detail/temporary_buffer.h", - "cuda/include/thrust/system/cuda/detail/terminate.h", - "cuda/include/thrust/system/cuda/detail/transform.h", - "cuda/include/thrust/system/cuda/detail/transform_reduce.h", - "cuda/include/thrust/system/cuda/detail/transform_scan.h", - "cuda/include/thrust/system/cuda/detail/uninitialized_copy.h", - "cuda/include/thrust/system/cuda/detail/uninitialized_fill.h", - "cuda/include/thrust/system/cuda/detail/unique.h", - "cuda/include/thrust/system/cuda/detail/unique_by_key.h", - "cuda/include/thrust/system/cuda/detail/util.h", - "cuda/include/thrust/system/cuda/detail/vector.inl", - "cuda/include/thrust/system/cuda/error.h", - "cuda/include/thrust/system/cuda/execution_policy.h", - "cuda/include/thrust/system/cuda/experimental/pinned_allocator.h", - "cuda/include/thrust/system/cuda/future.h", - "cuda/include/thrust/system/cuda/memory.h", - "cuda/include/thrust/system/cuda/memory_resource.h", - "cuda/include/thrust/system/cuda/pointer.h", - "cuda/include/thrust/system/cuda/vector.h", - "cuda/include/thrust/system/detail/adl/adjacent_difference.h", - "cuda/include/thrust/system/detail/adl/assign_value.h", - "cuda/include/thrust/system/detail/adl/async/copy.h", - "cuda/include/thrust/system/detail/adl/async/for_each.h", - "cuda/include/thrust/system/detail/adl/async/reduce.h", - "cuda/include/thrust/system/detail/adl/async/sort.h", - "cuda/include/thrust/system/detail/adl/async/transform.h", - "cuda/include/thrust/system/detail/adl/binary_search.h", - "cuda/include/thrust/system/detail/adl/copy.h", - "cuda/include/thrust/system/detail/adl/copy_if.h", - "cuda/include/thrust/system/detail/adl/count.h", - "cuda/include/thrust/system/detail/adl/equal.h", - "cuda/include/thrust/system/detail/adl/extrema.h", - "cuda/include/thrust/system/detail/adl/fill.h", - "cuda/include/thrust/system/detail/adl/find.h", - "cuda/include/thrust/system/detail/adl/for_each.h", - "cuda/include/thrust/system/detail/adl/gather.h", - "cuda/include/thrust/system/detail/adl/generate.h", - "cuda/include/thrust/system/detail/adl/get_value.h", - "cuda/include/thrust/system/detail/adl/inner_product.h", - "cuda/include/thrust/system/detail/adl/iter_swap.h", - "cuda/include/thrust/system/detail/adl/logical.h", - "cuda/include/thrust/system/detail/adl/malloc_and_free.h", - "cuda/include/thrust/system/detail/adl/merge.h", - "cuda/include/thrust/system/detail/adl/mismatch.h", - "cuda/include/thrust/system/detail/adl/partition.h", - "cuda/include/thrust/system/detail/adl/per_device_resource.h", - "cuda/include/thrust/system/detail/adl/reduce.h", - "cuda/include/thrust/system/detail/adl/reduce_by_key.h", - "cuda/include/thrust/system/detail/adl/remove.h", - "cuda/include/thrust/system/detail/adl/replace.h", - "cuda/include/thrust/system/detail/adl/reverse.h", - "cuda/include/thrust/system/detail/adl/scan.h", - "cuda/include/thrust/system/detail/adl/scan_by_key.h", - "cuda/include/thrust/system/detail/adl/scatter.h", - "cuda/include/thrust/system/detail/adl/sequence.h", - "cuda/include/thrust/system/detail/adl/set_operations.h", - "cuda/include/thrust/system/detail/adl/sort.h", - "cuda/include/thrust/system/detail/adl/swap_ranges.h", - "cuda/include/thrust/system/detail/adl/tabulate.h", - "cuda/include/thrust/system/detail/adl/temporary_buffer.h", - "cuda/include/thrust/system/detail/adl/transform.h", - 
"cuda/include/thrust/system/detail/adl/transform_reduce.h", - "cuda/include/thrust/system/detail/adl/transform_scan.h", - "cuda/include/thrust/system/detail/adl/uninitialized_copy.h", - "cuda/include/thrust/system/detail/adl/uninitialized_fill.h", - "cuda/include/thrust/system/detail/adl/unique.h", - "cuda/include/thrust/system/detail/adl/unique_by_key.h", - "cuda/include/thrust/system/detail/bad_alloc.h", - "cuda/include/thrust/system/detail/errno.h", - "cuda/include/thrust/system/detail/error_category.inl", - "cuda/include/thrust/system/detail/error_code.inl", - "cuda/include/thrust/system/detail/error_condition.inl", - "cuda/include/thrust/system/detail/generic/adjacent_difference.h", - "cuda/include/thrust/system/detail/generic/adjacent_difference.inl", - "cuda/include/thrust/system/detail/generic/advance.h", - "cuda/include/thrust/system/detail/generic/advance.inl", - "cuda/include/thrust/system/detail/generic/binary_search.h", - "cuda/include/thrust/system/detail/generic/binary_search.inl", - "cuda/include/thrust/system/detail/generic/copy.h", - "cuda/include/thrust/system/detail/generic/copy.inl", - "cuda/include/thrust/system/detail/generic/copy_if.h", - "cuda/include/thrust/system/detail/generic/copy_if.inl", - "cuda/include/thrust/system/detail/generic/count.h", - "cuda/include/thrust/system/detail/generic/count.inl", - "cuda/include/thrust/system/detail/generic/distance.h", - "cuda/include/thrust/system/detail/generic/distance.inl", - "cuda/include/thrust/system/detail/generic/equal.h", - "cuda/include/thrust/system/detail/generic/equal.inl", - "cuda/include/thrust/system/detail/generic/extrema.h", - "cuda/include/thrust/system/detail/generic/extrema.inl", - "cuda/include/thrust/system/detail/generic/fill.h", - "cuda/include/thrust/system/detail/generic/find.h", - "cuda/include/thrust/system/detail/generic/find.inl", - "cuda/include/thrust/system/detail/generic/for_each.h", - "cuda/include/thrust/system/detail/generic/gather.h", - "cuda/include/thrust/system/detail/generic/gather.inl", - "cuda/include/thrust/system/detail/generic/generate.h", - "cuda/include/thrust/system/detail/generic/generate.inl", - "cuda/include/thrust/system/detail/generic/inner_product.h", - "cuda/include/thrust/system/detail/generic/inner_product.inl", - "cuda/include/thrust/system/detail/generic/logical.h", - "cuda/include/thrust/system/detail/generic/memory.h", - "cuda/include/thrust/system/detail/generic/memory.inl", - "cuda/include/thrust/system/detail/generic/merge.h", - "cuda/include/thrust/system/detail/generic/merge.inl", - "cuda/include/thrust/system/detail/generic/mismatch.h", - "cuda/include/thrust/system/detail/generic/mismatch.inl", - "cuda/include/thrust/system/detail/generic/partition.h", - "cuda/include/thrust/system/detail/generic/partition.inl", - "cuda/include/thrust/system/detail/generic/per_device_resource.h", - "cuda/include/thrust/system/detail/generic/reduce.h", - "cuda/include/thrust/system/detail/generic/reduce.inl", - "cuda/include/thrust/system/detail/generic/reduce_by_key.h", - "cuda/include/thrust/system/detail/generic/reduce_by_key.inl", - "cuda/include/thrust/system/detail/generic/remove.h", - "cuda/include/thrust/system/detail/generic/remove.inl", - "cuda/include/thrust/system/detail/generic/replace.h", - "cuda/include/thrust/system/detail/generic/replace.inl", - "cuda/include/thrust/system/detail/generic/reverse.h", - "cuda/include/thrust/system/detail/generic/reverse.inl", - "cuda/include/thrust/system/detail/generic/scalar/binary_search.h", - 
"cuda/include/thrust/system/detail/generic/scalar/binary_search.inl", - "cuda/include/thrust/system/detail/generic/scan.h", - "cuda/include/thrust/system/detail/generic/scan.inl", - "cuda/include/thrust/system/detail/generic/scan_by_key.h", - "cuda/include/thrust/system/detail/generic/scan_by_key.inl", - "cuda/include/thrust/system/detail/generic/scatter.h", - "cuda/include/thrust/system/detail/generic/scatter.inl", - "cuda/include/thrust/system/detail/generic/select_system.h", - "cuda/include/thrust/system/detail/generic/select_system.inl", - "cuda/include/thrust/system/detail/generic/select_system_exists.h", - "cuda/include/thrust/system/detail/generic/sequence.h", - "cuda/include/thrust/system/detail/generic/sequence.inl", - "cuda/include/thrust/system/detail/generic/set_operations.h", - "cuda/include/thrust/system/detail/generic/set_operations.inl", - "cuda/include/thrust/system/detail/generic/sort.h", - "cuda/include/thrust/system/detail/generic/sort.inl", - "cuda/include/thrust/system/detail/generic/swap_ranges.h", - "cuda/include/thrust/system/detail/generic/swap_ranges.inl", - "cuda/include/thrust/system/detail/generic/tabulate.h", - "cuda/include/thrust/system/detail/generic/tabulate.inl", - "cuda/include/thrust/system/detail/generic/tag.h", - "cuda/include/thrust/system/detail/generic/temporary_buffer.h", - "cuda/include/thrust/system/detail/generic/temporary_buffer.inl", - "cuda/include/thrust/system/detail/generic/transform.h", - "cuda/include/thrust/system/detail/generic/transform.inl", - "cuda/include/thrust/system/detail/generic/transform_reduce.h", - "cuda/include/thrust/system/detail/generic/transform_reduce.inl", - "cuda/include/thrust/system/detail/generic/transform_scan.h", - "cuda/include/thrust/system/detail/generic/transform_scan.inl", - "cuda/include/thrust/system/detail/generic/uninitialized_copy.h", - "cuda/include/thrust/system/detail/generic/uninitialized_copy.inl", - "cuda/include/thrust/system/detail/generic/uninitialized_fill.h", - "cuda/include/thrust/system/detail/generic/uninitialized_fill.inl", - "cuda/include/thrust/system/detail/generic/unique.h", - "cuda/include/thrust/system/detail/generic/unique.inl", - "cuda/include/thrust/system/detail/generic/unique_by_key.h", - "cuda/include/thrust/system/detail/generic/unique_by_key.inl", - "cuda/include/thrust/system/detail/internal/decompose.h", - "cuda/include/thrust/system/detail/sequential/adjacent_difference.h", - "cuda/include/thrust/system/detail/sequential/assign_value.h", - "cuda/include/thrust/system/detail/sequential/binary_search.h", - "cuda/include/thrust/system/detail/sequential/copy.h", - "cuda/include/thrust/system/detail/sequential/copy.inl", - "cuda/include/thrust/system/detail/sequential/copy_backward.h", - "cuda/include/thrust/system/detail/sequential/copy_if.h", - "cuda/include/thrust/system/detail/sequential/count.h", - "cuda/include/thrust/system/detail/sequential/equal.h", - "cuda/include/thrust/system/detail/sequential/execution_policy.h", - "cuda/include/thrust/system/detail/sequential/extrema.h", - "cuda/include/thrust/system/detail/sequential/fill.h", - "cuda/include/thrust/system/detail/sequential/find.h", - "cuda/include/thrust/system/detail/sequential/for_each.h", - "cuda/include/thrust/system/detail/sequential/gather.h", - "cuda/include/thrust/system/detail/sequential/general_copy.h", - "cuda/include/thrust/system/detail/sequential/generate.h", - "cuda/include/thrust/system/detail/sequential/get_value.h", - "cuda/include/thrust/system/detail/sequential/inner_product.h", - 
"cuda/include/thrust/system/detail/sequential/insertion_sort.h", - "cuda/include/thrust/system/detail/sequential/iter_swap.h", - "cuda/include/thrust/system/detail/sequential/logical.h", - "cuda/include/thrust/system/detail/sequential/malloc_and_free.h", - "cuda/include/thrust/system/detail/sequential/merge.h", - "cuda/include/thrust/system/detail/sequential/merge.inl", - "cuda/include/thrust/system/detail/sequential/mismatch.h", - "cuda/include/thrust/system/detail/sequential/partition.h", - "cuda/include/thrust/system/detail/sequential/per_device_resource.h", - "cuda/include/thrust/system/detail/sequential/reduce.h", - "cuda/include/thrust/system/detail/sequential/reduce_by_key.h", - "cuda/include/thrust/system/detail/sequential/remove.h", - "cuda/include/thrust/system/detail/sequential/replace.h", - "cuda/include/thrust/system/detail/sequential/reverse.h", - "cuda/include/thrust/system/detail/sequential/scan.h", - "cuda/include/thrust/system/detail/sequential/scan_by_key.h", - "cuda/include/thrust/system/detail/sequential/scatter.h", - "cuda/include/thrust/system/detail/sequential/sequence.h", - "cuda/include/thrust/system/detail/sequential/set_operations.h", - "cuda/include/thrust/system/detail/sequential/sort.h", - "cuda/include/thrust/system/detail/sequential/sort.inl", - "cuda/include/thrust/system/detail/sequential/stable_merge_sort.h", - "cuda/include/thrust/system/detail/sequential/stable_merge_sort.inl", - "cuda/include/thrust/system/detail/sequential/stable_primitive_sort.h", - "cuda/include/thrust/system/detail/sequential/stable_primitive_sort.inl", - "cuda/include/thrust/system/detail/sequential/stable_radix_sort.h", - "cuda/include/thrust/system/detail/sequential/stable_radix_sort.inl", - "cuda/include/thrust/system/detail/sequential/swap_ranges.h", - "cuda/include/thrust/system/detail/sequential/tabulate.h", - "cuda/include/thrust/system/detail/sequential/temporary_buffer.h", - "cuda/include/thrust/system/detail/sequential/transform.h", - "cuda/include/thrust/system/detail/sequential/transform_reduce.h", - "cuda/include/thrust/system/detail/sequential/transform_scan.h", - "cuda/include/thrust/system/detail/sequential/trivial_copy.h", - "cuda/include/thrust/system/detail/sequential/uninitialized_copy.h", - "cuda/include/thrust/system/detail/sequential/uninitialized_fill.h", - "cuda/include/thrust/system/detail/sequential/unique.h", - "cuda/include/thrust/system/detail/sequential/unique_by_key.h", - "cuda/include/thrust/system/detail/system_error.inl", - "cuda/include/thrust/system/error_code.h", - "cuda/include/thrust/system/omp/detail/adjacent_difference.h", - "cuda/include/thrust/system/omp/detail/assign_value.h", - "cuda/include/thrust/system/omp/detail/binary_search.h", - "cuda/include/thrust/system/omp/detail/copy.h", - "cuda/include/thrust/system/omp/detail/copy.inl", - "cuda/include/thrust/system/omp/detail/copy_if.h", - "cuda/include/thrust/system/omp/detail/copy_if.inl", - "cuda/include/thrust/system/omp/detail/count.h", - "cuda/include/thrust/system/omp/detail/default_decomposition.h", - "cuda/include/thrust/system/omp/detail/default_decomposition.inl", - "cuda/include/thrust/system/omp/detail/equal.h", - "cuda/include/thrust/system/omp/detail/execution_policy.h", - "cuda/include/thrust/system/omp/detail/extrema.h", - "cuda/include/thrust/system/omp/detail/fill.h", - "cuda/include/thrust/system/omp/detail/find.h", - "cuda/include/thrust/system/omp/detail/for_each.h", - "cuda/include/thrust/system/omp/detail/for_each.inl", - 
"cuda/include/thrust/system/omp/detail/gather.h", - "cuda/include/thrust/system/omp/detail/generate.h", - "cuda/include/thrust/system/omp/detail/get_value.h", - "cuda/include/thrust/system/omp/detail/inner_product.h", - "cuda/include/thrust/system/omp/detail/iter_swap.h", - "cuda/include/thrust/system/omp/detail/logical.h", - "cuda/include/thrust/system/omp/detail/malloc_and_free.h", - "cuda/include/thrust/system/omp/detail/memory.inl", - "cuda/include/thrust/system/omp/detail/merge.h", - "cuda/include/thrust/system/omp/detail/mismatch.h", - "cuda/include/thrust/system/omp/detail/par.h", - "cuda/include/thrust/system/omp/detail/partition.h", - "cuda/include/thrust/system/omp/detail/partition.inl", - "cuda/include/thrust/system/omp/detail/per_device_resource.h", - "cuda/include/thrust/system/omp/detail/pointer.inl", - "cuda/include/thrust/system/omp/detail/reduce.h", - "cuda/include/thrust/system/omp/detail/reduce.inl", - "cuda/include/thrust/system/omp/detail/reduce_by_key.h", - "cuda/include/thrust/system/omp/detail/reduce_by_key.inl", - "cuda/include/thrust/system/omp/detail/reduce_intervals.h", - "cuda/include/thrust/system/omp/detail/reduce_intervals.inl", - "cuda/include/thrust/system/omp/detail/remove.h", - "cuda/include/thrust/system/omp/detail/remove.inl", - "cuda/include/thrust/system/omp/detail/replace.h", - "cuda/include/thrust/system/omp/detail/reverse.h", - "cuda/include/thrust/system/omp/detail/scan.h", - "cuda/include/thrust/system/omp/detail/scan_by_key.h", - "cuda/include/thrust/system/omp/detail/scatter.h", - "cuda/include/thrust/system/omp/detail/sequence.h", - "cuda/include/thrust/system/omp/detail/set_operations.h", - "cuda/include/thrust/system/omp/detail/sort.h", - "cuda/include/thrust/system/omp/detail/sort.inl", - "cuda/include/thrust/system/omp/detail/swap_ranges.h", - "cuda/include/thrust/system/omp/detail/tabulate.h", - "cuda/include/thrust/system/omp/detail/temporary_buffer.h", - "cuda/include/thrust/system/omp/detail/transform.h", - "cuda/include/thrust/system/omp/detail/transform_reduce.h", - "cuda/include/thrust/system/omp/detail/transform_scan.h", - "cuda/include/thrust/system/omp/detail/uninitialized_copy.h", - "cuda/include/thrust/system/omp/detail/uninitialized_fill.h", - "cuda/include/thrust/system/omp/detail/unique.h", - "cuda/include/thrust/system/omp/detail/unique.inl", - "cuda/include/thrust/system/omp/detail/unique_by_key.h", - "cuda/include/thrust/system/omp/detail/unique_by_key.inl", - "cuda/include/thrust/system/omp/detail/vector.inl", - "cuda/include/thrust/system/omp/execution_policy.h", - "cuda/include/thrust/system/omp/memory.h", - "cuda/include/thrust/system/omp/memory_resource.h", - "cuda/include/thrust/system/omp/pointer.h", - "cuda/include/thrust/system/omp/vector.h", - "cuda/include/thrust/system/system_error.h", - "cuda/include/thrust/system/tbb/detail/adjacent_difference.h", - "cuda/include/thrust/system/tbb/detail/assign_value.h", - "cuda/include/thrust/system/tbb/detail/binary_search.h", - "cuda/include/thrust/system/tbb/detail/copy.h", - "cuda/include/thrust/system/tbb/detail/copy.inl", - "cuda/include/thrust/system/tbb/detail/copy_if.h", - "cuda/include/thrust/system/tbb/detail/copy_if.inl", - "cuda/include/thrust/system/tbb/detail/count.h", - "cuda/include/thrust/system/tbb/detail/equal.h", - "cuda/include/thrust/system/tbb/detail/execution_policy.h", - "cuda/include/thrust/system/tbb/detail/extrema.h", - "cuda/include/thrust/system/tbb/detail/fill.h", - "cuda/include/thrust/system/tbb/detail/find.h", - 
"cuda/include/thrust/system/tbb/detail/for_each.h", - "cuda/include/thrust/system/tbb/detail/for_each.inl", - "cuda/include/thrust/system/tbb/detail/gather.h", - "cuda/include/thrust/system/tbb/detail/generate.h", - "cuda/include/thrust/system/tbb/detail/get_value.h", - "cuda/include/thrust/system/tbb/detail/inner_product.h", - "cuda/include/thrust/system/tbb/detail/iter_swap.h", - "cuda/include/thrust/system/tbb/detail/logical.h", - "cuda/include/thrust/system/tbb/detail/malloc_and_free.h", - "cuda/include/thrust/system/tbb/detail/memory.inl", - "cuda/include/thrust/system/tbb/detail/merge.h", - "cuda/include/thrust/system/tbb/detail/merge.inl", - "cuda/include/thrust/system/tbb/detail/mismatch.h", - "cuda/include/thrust/system/tbb/detail/par.h", - "cuda/include/thrust/system/tbb/detail/partition.h", - "cuda/include/thrust/system/tbb/detail/partition.inl", - "cuda/include/thrust/system/tbb/detail/per_device_resource.h", - "cuda/include/thrust/system/tbb/detail/pointer.inl", - "cuda/include/thrust/system/tbb/detail/reduce.h", - "cuda/include/thrust/system/tbb/detail/reduce.inl", - "cuda/include/thrust/system/tbb/detail/reduce_by_key.h", - "cuda/include/thrust/system/tbb/detail/reduce_by_key.inl", - "cuda/include/thrust/system/tbb/detail/reduce_intervals.h", - "cuda/include/thrust/system/tbb/detail/remove.h", - "cuda/include/thrust/system/tbb/detail/remove.inl", - "cuda/include/thrust/system/tbb/detail/replace.h", - "cuda/include/thrust/system/tbb/detail/reverse.h", - "cuda/include/thrust/system/tbb/detail/scan.h", - "cuda/include/thrust/system/tbb/detail/scan.inl", - "cuda/include/thrust/system/tbb/detail/scan_by_key.h", - "cuda/include/thrust/system/tbb/detail/scatter.h", - "cuda/include/thrust/system/tbb/detail/sequence.h", - "cuda/include/thrust/system/tbb/detail/set_operations.h", - "cuda/include/thrust/system/tbb/detail/sort.h", - "cuda/include/thrust/system/tbb/detail/sort.inl", - "cuda/include/thrust/system/tbb/detail/swap_ranges.h", - "cuda/include/thrust/system/tbb/detail/tabulate.h", - "cuda/include/thrust/system/tbb/detail/temporary_buffer.h", - "cuda/include/thrust/system/tbb/detail/transform.h", - "cuda/include/thrust/system/tbb/detail/transform_reduce.h", - "cuda/include/thrust/system/tbb/detail/transform_scan.h", - "cuda/include/thrust/system/tbb/detail/uninitialized_copy.h", - "cuda/include/thrust/system/tbb/detail/uninitialized_fill.h", - "cuda/include/thrust/system/tbb/detail/unique.h", - "cuda/include/thrust/system/tbb/detail/unique.inl", - "cuda/include/thrust/system/tbb/detail/unique_by_key.h", - "cuda/include/thrust/system/tbb/detail/unique_by_key.inl", - "cuda/include/thrust/system/tbb/detail/vector.inl", - "cuda/include/thrust/system/tbb/execution_policy.h", - "cuda/include/thrust/system/tbb/memory.h", - "cuda/include/thrust/system/tbb/memory_resource.h", - "cuda/include/thrust/system/tbb/pointer.h", - "cuda/include/thrust/system/tbb/vector.h", - "cuda/include/thrust/system_error.h", - "cuda/include/thrust/tabulate.h", - "cuda/include/thrust/transform.h", - "cuda/include/thrust/transform_reduce.h", - "cuda/include/thrust/transform_scan.h", - "cuda/include/thrust/tuple.h", - "cuda/include/thrust/type_traits/integer_sequence.h", - "cuda/include/thrust/type_traits/is_contiguous_iterator.h", - "cuda/include/thrust/type_traits/is_execution_policy.h", - "cuda/include/thrust/type_traits/is_operator_less_or_greater_function_object.h", - "cuda/include/thrust/type_traits/is_operator_plus_function_object.h", - "cuda/include/thrust/type_traits/is_trivially_relocatable.h", - 
"cuda/include/thrust/type_traits/logical_metafunctions.h", - "cuda/include/thrust/type_traits/remove_cvref.h", - "cuda/include/thrust/type_traits/void_t.h", - "cuda/include/thrust/uninitialized_copy.h", - "cuda/include/thrust/uninitialized_fill.h", - "cuda/include/thrust/unique.h", - "cuda/include/thrust/version.h", - "cuda/include/vector_functions.h", - "cuda/include/vector_functions.hpp", - "cuda/include/vector_types.h", - ], - cmd = """cp -rLf "/usr/local/cuda-10.1/include/." "$(@D)/cuda/include/" """, -) - -genrule( - name = "cuda-nvvm", - outs = [ - "cuda/nvvm/libdevice/libdevice.10.bc", - ], - cmd = """cp -rLf "/usr/local/cuda-10.1/nvvm/libdevice/." "$(@D)/" """, -) - -genrule( - name = "cuda-extras", - outs = [ - "cuda/extras/CUPTI/include/Openacc/cupti_openacc.h", - "cuda/extras/CUPTI/include/Openmp/cupti_openmp.h", - "cuda/extras/CUPTI/include/Openmp/ompt.h", - "cuda/extras/CUPTI/include/cuda_stdint.h", - "cuda/extras/CUPTI/include/cupti.h", - "cuda/extras/CUPTI/include/cupti_activity.h", - "cuda/extras/CUPTI/include/cupti_callbacks.h", - "cuda/extras/CUPTI/include/cupti_driver_cbid.h", - "cuda/extras/CUPTI/include/cupti_events.h", - "cuda/extras/CUPTI/include/cupti_metrics.h", - "cuda/extras/CUPTI/include/cupti_nvtx_cbid.h", - "cuda/extras/CUPTI/include/cupti_profiler_target.h", - "cuda/extras/CUPTI/include/cupti_result.h", - "cuda/extras/CUPTI/include/cupti_runtime_cbid.h", - "cuda/extras/CUPTI/include/cupti_target.h", - "cuda/extras/CUPTI/include/cupti_version.h", - "cuda/extras/CUPTI/include/generated_cudaGL_meta.h", - "cuda/extras/CUPTI/include/generated_cudaVDPAU_meta.h", - "cuda/extras/CUPTI/include/generated_cuda_gl_interop_meta.h", - "cuda/extras/CUPTI/include/generated_cuda_meta.h", - "cuda/extras/CUPTI/include/generated_cuda_runtime_api_meta.h", - "cuda/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h", - "cuda/extras/CUPTI/include/generated_nvtx_meta.h", - "cuda/extras/CUPTI/include/nvperf_cuda_host.h", - "cuda/extras/CUPTI/include/nvperf_host.h", - "cuda/extras/CUPTI/include/nvperf_target.h", - ], - cmd = """cp -rLf "/usr/local/cuda-10.1/extras/CUPTI/include/." 
"$(@D)/cuda/extras/CUPTI/include/" """, -) - -genrule( - name = "cublas-include", - outs = [ - "cublas/include/cublas.h", - "cublas/include/cublas_v2.h", - "cublas/include/cublas_api.h", - ], - cmd = """cp -f "/usr/include/cublas.h" "$(location cublas/include/cublas.h)" && \ -cp -f "/usr/include/cublas_v2.h" "$(location cublas/include/cublas_v2.h)" && \ -cp -f "/usr/include/cublas_api.h" "$(location cublas/include/cublas_api.h)" """, -) - -genrule( - name = "cuda-lib", - outs = [ - "cuda/lib/libcuda.so", - "cuda/lib/libcudart.so.10.1", - "cuda/lib/libcudart_static.a", - "cuda/lib/libcublas.so.10", - "cuda/lib/libcusolver.so.10", - "cuda/lib/libcurand.so.10", - "cuda/lib/libcufft.so.10", - "cuda/lib/libcudnn.so.7", - "cuda/lib/libcupti.so.10.1", - "cuda/lib/libcusparse.so.10", - ], - cmd = """cp -f "/usr/local/cuda-10.1/lib64/stubs/libcuda.so" "$(location cuda/lib/libcuda.so)" && \ -cp -f "/usr/local/cuda-10.1/lib64/libcudart.so.10.1" "$(location cuda/lib/libcudart.so.10.1)" && \ -cp -f "/usr/local/cuda-10.1/lib64/libcudart_static.a" "$(location cuda/lib/libcudart_static.a)" && \ -cp -f "/usr/lib64/libcublas.so.10" "$(location cuda/lib/libcublas.so.10)" && \ -cp -f "/usr/local/cuda-10.1/lib64/libcusolver.so.10" "$(location cuda/lib/libcusolver.so.10)" && \ -cp -f "/usr/local/cuda-10.1/lib64/libcurand.so.10" "$(location cuda/lib/libcurand.so.10)" && \ -cp -f "/usr/local/cuda-10.1/lib64/libcufft.so.10" "$(location cuda/lib/libcufft.so.10)" && \ -cp -f "/usr/local/cuda-10.1/lib64/libcudnn.so.7" "$(location cuda/lib/libcudnn.so.7)" && \ -cp -f "/usr/local/cuda-10.1/extras/CUPTI/lib64/libcupti.so.10.1" "$(location cuda/lib/libcupti.so.10.1)" && \ -cp -f "/usr/local/cuda-10.1/lib64/libcusparse.so.10" "$(location cuda/lib/libcusparse.so.10)" """, -) - -genrule( - name = "cuda-bin", - outs = [ - "cuda/bin/bin2c", - "cuda/bin/crt/link.stub", - "cuda/bin/crt/prelink.stub", - "cuda/bin/cuda-gdb", - "cuda/bin/cuda-gdbserver", - "cuda/bin/cuda-memcheck", - "cuda/bin/cudafe++", - "cuda/bin/cuobjdump", - "cuda/bin/fatbinary", - "cuda/bin/gpu-library-advisor", - "cuda/bin/nvcc", - "cuda/bin/nvcc.profile", - "cuda/bin/nvdisasm", - "cuda/bin/nvlink", - "cuda/bin/nvprof", - "cuda/bin/nvprune", - "cuda/bin/ptxas", - ], - cmd = """cp -rLf "/usr/local/cuda-10.1/bin/." "$(@D)/cuda/bin/" """, -) - -genrule( - name = "cudnn-include", - outs = [ - "cudnn/include/cudnn.h", - ], - cmd = """cp -f "/usr/local/cuda-10.1/include/cudnn.h" "$(location cudnn/include/cudnn.h)" """, -) diff --git a/third_party/toolchains/preconfig/centos6/cuda10.1-cudnn7/cuda/build_defs.bzl b/third_party/toolchains/preconfig/centos6/cuda10.1-cudnn7/cuda/build_defs.bzl deleted file mode 100755 index fe5ddff6572..00000000000 --- a/third_party/toolchains/preconfig/centos6/cuda10.1-cudnn7/cuda/build_defs.bzl +++ /dev/null @@ -1,65 +0,0 @@ -"""Macros for building CUDA code.""" - -def if_cuda(if_true, if_false = []): - """Shorthand for select()'ing on whether we're building with CUDA. - - Returns a select statement which evaluates to if_true if we're building - with CUDA enabled. Otherwise, the select statement evaluates to if_false. 
- - """ - return select({ - "@local_config_cuda//cuda:using_nvcc": if_true, - "@local_config_cuda//cuda:using_clang": if_true, - "//conditions:default": if_false, - }) - -def cuda_default_copts(): - """Default options for all CUDA compilations.""" - return if_cuda(["-x", "cuda", "-DGOOGLE_CUDA=1"] + []) - -def cuda_is_configured(): - """Returns true if CUDA was enabled during the configure process.""" - return True - -def if_cuda_is_configured(x): - """Tests if the CUDA was enabled during the configure process. - - Unlike if_cuda(), this does not require that we are building with - --config=cuda. Used to allow non-CUDA code to depend on CUDA libraries. - """ - if cuda_is_configured(): - return select({"//conditions:default": x}) - return select({"//conditions:default": []}) - -def cuda_header_library( - name, - hdrs, - include_prefix = None, - strip_include_prefix = None, - deps = [], - **kwargs): - """Generates a cc_library containing both virtual and system include paths. - - Generates both a header-only target with virtual includes plus the full - target without virtual includes. This works around the fact that bazel can't - mix 'includes' and 'include_prefix' in the same target.""" - - native.cc_library( - name = name + "_virtual", - hdrs = hdrs, - include_prefix = include_prefix, - strip_include_prefix = strip_include_prefix, - deps = deps, - visibility = ["//visibility:private"], - ) - - native.cc_library( - name = name, - textual_hdrs = hdrs, - deps = deps + [":%s_virtual" % name], - **kwargs - ) - -def cuda_library(copts = [], **kwargs): - """Wrapper over cc_library which adds default CUDA options.""" - native.cc_library(copts = cuda_default_copts() + copts, **kwargs) diff --git a/third_party/toolchains/preconfig/centos6/cuda10.1-cudnn7/cuda/cuda/cuda_config.h b/third_party/toolchains/preconfig/centos6/cuda10.1-cudnn7/cuda/cuda/cuda_config.h deleted file mode 100755 index f7e84335e6b..00000000000 --- a/third_party/toolchains/preconfig/centos6/cuda10.1-cudnn7/cuda/cuda/cuda_config.h +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef CUDA_CUDA_CONFIG_H_ -#define CUDA_CUDA_CONFIG_H_ - -#define TF_CUDA_CAPABILITIES CudaVersion("3.0"), CudaVersion("6.0") - -#define TF_CUDA_VERSION "10.1" -#define TF_CUDA_LIB_VERSION "10" -#define TF_CUDNN_VERSION "7" - -#define TF_CUDA_TOOLKIT_PATH "/usr/local/cuda-10.1" - -#endif // CUDA_CUDA_CONFIG_H_ diff --git a/third_party/toolchains/preconfig/centos6/gcc7-nvcc-cuda10.0/BUILD b/third_party/toolchains/preconfig/centos6/gcc7-nvcc-cuda10.0/BUILD deleted file mode 100755 index 6b94c9a1e12..00000000000 --- a/third_party/toolchains/preconfig/centos6/gcc7-nvcc-cuda10.0/BUILD +++ /dev/null @@ -1,170 +0,0 @@ -# This file is expanded from a template by cuda_configure.bzl -# Update cuda_configure.bzl#verify_build_defines when adding new variables. 
- -load(":cc_toolchain_config.bzl", "cc_toolchain_config") - -licenses(["restricted"]) - -package(default_visibility = ["//visibility:public"]) - -toolchain( - name = "toolchain-linux-x86_64", - exec_compatible_with = [ - "@bazel_tools//platforms:linux", - "@bazel_tools//platforms:x86_64", - ], - target_compatible_with = [ - "@bazel_tools//platforms:linux", - "@bazel_tools//platforms:x86_64", - ], - toolchain = ":cc-compiler-local", - toolchain_type = "@bazel_tools//tools/cpp:toolchain_type", -) - -cc_toolchain_suite( - name = "toolchain", - toolchains = { - "local|compiler": ":cc-compiler-local", - "darwin|compiler": ":cc-compiler-darwin", - "x64_windows|msvc-cl": ":cc-compiler-windows", - "x64_windows": ":cc-compiler-windows", - "arm": ":cc-compiler-local", - "aarch64": ":cc-compiler-local", - "k8": ":cc-compiler-local", - "piii": ":cc-compiler-local", - "ppc": ":cc-compiler-local", - "darwin": ":cc-compiler-darwin", - }, -) - -cc_toolchain( - name = "cc-compiler-local", - all_files = ":crosstool_wrapper_driver_is_not_gcc", - compiler_files = ":empty", - dwp_files = ":empty", - linker_files = ":crosstool_wrapper_driver_is_not_gcc", - objcopy_files = ":empty", - strip_files = ":empty", - # To support linker flags that need to go to the start of command line - # we need the toolchain to support parameter files. Parameter files are - # last on the command line and contain all shared libraries to link, so all - # regular options will be left of them. - supports_param_files = 1, - toolchain_config = ":cc-compiler-local-config", - toolchain_identifier = "local_linux", -) - -cc_toolchain_config( - name = "cc-compiler-local-config", - builtin_include_directories = [ - "/opt/rh/devtoolset-7/root/usr/include/c++/7", - "/opt/rh/devtoolset-7/root/usr/include/c++/7/x86_64-redhat-linux", - "/opt/rh/devtoolset-7/root/usr/include/c++/7/backward", - "/opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/include", - "/usr/local/include", - "/opt/rh/devtoolset-7/root/usr/include", - "/usr/include", - "/usr/local/cuda-10.0/targets/x86_64-linux/include", - "/usr/local/cuda-10.0/include", - "/usr/local/cuda-10.0/extras/CUPTI/include", - "/usr/local/cuda-10.0/include", - ], - cpu = "local", - extra_no_canonical_prefixes_flags = ["-fno-canonical-system-headers"], - host_compiler_path = "clang/bin/crosstool_wrapper_driver_is_not_gcc", - host_compiler_prefix = "/opt/rh/devtoolset-7/root/usr/bin", - host_compiler_warnings = [], - host_unfiltered_compile_flags = [], - linker_bin_path = "/opt/rh/devtoolset-7/root/usr/bin", -) - -cc_toolchain( - name = "cc-compiler-darwin", - all_files = ":crosstool_wrapper_driver_is_not_gcc", - compiler_files = ":empty", - dwp_files = ":empty", - linker_files = ":crosstool_wrapper_driver_is_not_gcc", - objcopy_files = ":empty", - strip_files = ":empty", - supports_param_files = 0, - toolchain_config = ":cc-compiler-local-darwin", - toolchain_identifier = "local_darwin", -) - -cc_toolchain_config( - name = "cc-compiler-local-darwin", - builtin_include_directories = [ - "/opt/rh/devtoolset-7/root/usr/include/c++/7", - "/opt/rh/devtoolset-7/root/usr/include/c++/7/x86_64-redhat-linux", - "/opt/rh/devtoolset-7/root/usr/include/c++/7/backward", - "/opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/include", - "/usr/local/include", - "/opt/rh/devtoolset-7/root/usr/include", - "/usr/include", - "/usr/local/cuda-10.0/targets/x86_64-linux/include", - "/usr/local/cuda-10.0/include", - "/usr/local/cuda-10.0/extras/CUPTI/include", - "/usr/local/cuda-10.0/include", - ], - cpu = 
"darwin", - extra_no_canonical_prefixes_flags = ["-fno-canonical-system-headers"], - host_compiler_path = "clang/bin/crosstool_wrapper_driver_is_not_gcc", - host_compiler_prefix = "/opt/rh/devtoolset-7/root/usr/bin", - host_compiler_warnings = [], - host_unfiltered_compile_flags = [], - linker_bin_path = "/opt/rh/devtoolset-7/root/usr/bin", -) - -cc_toolchain( - name = "cc-compiler-windows", - all_files = ":windows_msvc_wrapper_files", - compiler_files = ":empty", - dwp_files = ":empty", - linker_files = ":windows_msvc_wrapper_files", - objcopy_files = ":empty", - strip_files = ":empty", - supports_param_files = 1, - toolchain_config = ":cc-compiler-windows-config", - toolchain_identifier = "local_windows", -) - -cc_toolchain_config( - name = "cc-compiler-windows-config", - builtin_include_directories = [ - "/opt/rh/devtoolset-7/root/usr/include/c++/7", - "/opt/rh/devtoolset-7/root/usr/include/c++/7/x86_64-redhat-linux", - "/opt/rh/devtoolset-7/root/usr/include/c++/7/backward", - "/opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/include", - "/usr/local/include", - "/opt/rh/devtoolset-7/root/usr/include", - "/usr/include", - "/usr/local/cuda-10.0/targets/x86_64-linux/include", - "/usr/local/cuda-10.0/include", - "/usr/local/cuda-10.0/extras/CUPTI/include", - "/usr/local/cuda-10.0/include", - ], - cpu = "x64_windows", - msvc_cl_path = "msvc_not_used", - msvc_env_include = "msvc_not_used", - msvc_env_lib = "msvc_not_used", - msvc_env_path = "msvc_not_used", - msvc_env_tmp = "msvc_not_used", - msvc_lib_path = "msvc_not_used", - msvc_link_path = "msvc_not_used", - msvc_ml_path = "msvc_not_used", -) - -filegroup( - name = "empty", - srcs = [], -) - -filegroup( - name = "crosstool_wrapper_driver_is_not_gcc", - srcs = ["clang/bin/crosstool_wrapper_driver_is_not_gcc"], -) - -filegroup( - name = "windows_msvc_wrapper_files", - srcs = glob(["windows/msvc_*"]), -) diff --git a/third_party/toolchains/preconfig/centos6/gcc7-nvcc-cuda10.0/CROSSTOOL b/third_party/toolchains/preconfig/centos6/gcc7-nvcc-cuda10.0/CROSSTOOL deleted file mode 100755 index 3e1ca08218f..00000000000 --- a/third_party/toolchains/preconfig/centos6/gcc7-nvcc-cuda10.0/CROSSTOOL +++ /dev/null @@ -1,1429 +0,0 @@ -major_version: "local" -minor_version: "" -default_target_cpu: "same_as_host" - -toolchain { - abi_version: "local" - abi_libc_version: "local" - compiler: "compiler" - host_system_name: "local" - needsPic: true - target_libc: "local" - target_cpu: "local" - target_system_name: "local" - toolchain_identifier: "local_linux" - - feature { - name: "c++11" - flag_set { - action: "c++-compile" - flag_group { - flag: "-std=c++11" - } - } - } - - feature { - name: "stdlib" - flag_set { - action: "c++-link-executable" - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "-lstdc++" - } - } - } - - feature { - name: "determinism" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - # Make C++ compilation deterministic. Use linkstamping instead of these - # compiler symbols. - flag: "-Wno-builtin-macro-redefined" - flag: "-D__DATE__=\"redacted\"" - flag: "-D__TIMESTAMP__=\"redacted\"" - flag: "-D__TIME__=\"redacted\"" - } - } - } - - feature { - name: "alwayslink" - flag_set { - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - action: "c++-link-executable" - flag_group { - flag: "-Wl,-no-as-needed" - } - } - } - - # This feature will be enabled for builds that support pic by bazel. 
- feature { - name: "pic" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - expand_if_all_available: "pic" - flag: "-fPIC" - } - flag_group { - expand_if_none_available: "pic" - flag: "-fPIE" - } - } - } - - # Security hardening on by default. - feature { - name: "hardening" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases. - # We need to undef it before redefining it as some distributions now - # have it enabled by default. - flag: "-U_FORTIFY_SOURCE" - flag: "-D_FORTIFY_SOURCE=1" - flag: "-fstack-protector" - } - } - flag_set { - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "-Wl,-z,relro,-z,now" - } - } - flag_set { - action: "c++-link-executable" - flag_group { - flag: "-pie" - flag: "-Wl,-z,relro,-z,now" - } - } - } - - feature { - name: "warnings" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - # All warnings are enabled. Maybe enable -Werror as well? - flag: "-Wall" - - } - } - } - - # Keep stack frames for debugging, even in opt mode. - feature { - name: "frame-pointer" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - flag: "-fno-omit-frame-pointer" - } - } - } - - feature { - name: "build-id" - flag_set { - action: "c++-link-executable" - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - flag_group { - # Stamp the binary with a unique identifier. - flag: "-Wl,--build-id=md5" - flag: "-Wl,--hash-style=gnu" - } - } - } - - feature { - name: "no-canonical-prefixes" - flag_set { - action: "c-compile" - action: "c++-compile" - action: "c++-link-executable" - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "-no-canonical-prefixes" - flag: "-fno-canonical-system-headers" - } - } - } - - feature { - name: "disable-assertions" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - flag: "-DNDEBUG" - } - } - } - - feature { - name: "linker-bin-path" - - flag_set { - action: "c++-link-executable" - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "-B/opt/rh/devtoolset-7/root/usr/bin" - } - } - } - - feature { - name: "common" - implies: "stdlib" - implies: "c++11" - implies: "determinism" - implies: "alwayslink" - implies: "hardening" - implies: "warnings" - implies: "frame-pointer" - implies: "build-id" - implies: "no-canonical-prefixes" - implies: "linker-bin-path" - } - - feature { - name: "opt" - implies: "common" - implies: "disable-assertions" - - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - # No debug symbols. - # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt - # or even generally? However, that can't happen here, as it requires - # special handling in Bazel. - flag: "-g0" - - # Conservative choice for -O - # -O3 can increase binary size and even slow down the resulting binaries. - # Profile first and / or use FDO if you need better performance than this. - flag: "-O2" - - # Removal of unused code and data at link time (can this increase binary size in some cases?). 
- flag: "-ffunction-sections" - flag: "-fdata-sections" - } - } - flag_set { - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - action: "c++-link-executable" - flag_group { - flag: "-Wl,--gc-sections" - } - } - } - - feature { - name: "fastbuild" - implies: "common" - } - - feature { - name: "dbg" - implies: "common" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - flag: "-g" - } - } - } - - # Set clang as a C/C++ compiler. - tool_path { name: "gcc" path: "clang/bin/crosstool_wrapper_driver_is_not_gcc" } - - # Use the default system toolchain for everything else. - tool_path { name: "ar" path: "/opt/rh/devtoolset-7/root/usr/bin/ar" } - tool_path { name: "compat-ld" path: "/opt/rh/devtoolset-7/root/usr/bin/ld" } - tool_path { name: "cpp" path: "/opt/rh/devtoolset-7/root/usr/bin/cpp" } - tool_path { name: "dwp" path: "/opt/rh/devtoolset-7/root/usr/bin/dwp" } - tool_path { name: "gcov" path: "/opt/rh/devtoolset-7/root/usr/bin/gcov" } - tool_path { name: "ld" path: "/opt/rh/devtoolset-7/root/usr/bin/ld" } - tool_path { name: "nm" path: "/opt/rh/devtoolset-7/root/usr/bin/nm" } - tool_path { name: "objcopy" path: "/opt/rh/devtoolset-7/root/usr/bin/objcopy" } - tool_path { name: "objdump" path: "/opt/rh/devtoolset-7/root/usr/bin/objdump" } - tool_path { name: "strip" path: "/opt/rh/devtoolset-7/root/usr/bin/strip" } - - # Enabled dynamic linking. - linking_mode_flags { mode: DYNAMIC } - - cxx_builtin_include_directory: "/opt/rh/devtoolset-7/root/usr/include/c++/7" - cxx_builtin_include_directory: "/opt/rh/devtoolset-7/root/usr/include/c++/7/x86_64-redhat-linux" - cxx_builtin_include_directory: "/opt/rh/devtoolset-7/root/usr/include/c++/7/backward" - cxx_builtin_include_directory: "/opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/include" - cxx_builtin_include_directory: "/usr/local/include" - cxx_builtin_include_directory: "/opt/rh/devtoolset-7/root/usr/include" - cxx_builtin_include_directory: "/usr/include" - cxx_builtin_include_directory: "/usr/local/cuda-10.0/targets/x86_64-linux/include" - cxx_builtin_include_directory: "/usr/local/cuda-10.0/include" - cxx_builtin_include_directory: "/usr/local/cuda-10.0/extras/CUPTI/include" - cxx_builtin_include_directory: "/usr/local/cuda-10.0/include" -} - -toolchain { - abi_version: "local" - abi_libc_version: "local" - compiler: "compiler" - host_system_name: "local" - needsPic: true - target_libc: "macosx" - target_cpu: "darwin" - target_system_name: "local" - toolchain_identifier: "local_darwin" - feature { - name: "c++11" - flag_set { - action: "c++-compile" - flag_group { - flag: "-std=c++11" - } - } - } - - feature { - name: "stdlib" - flag_set { - action: "c++-link-executable" - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "-lc++" - } - } - } - - feature { - name: "determinism" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - # Make C++ compilation deterministic. Use linkstamping instead of these - # compiler symbols. - flag: "-Wno-builtin-macro-redefined" - flag: "-D__DATE__=\"redacted\"" - flag: "-D__TIMESTAMP__=\"redacted\"" - flag: "-D__TIME__=\"redacted\"" - } - } - } - - # This feature will be enabled for builds that support pic by bazel. 
- feature { - name: "pic" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - expand_if_all_available: "pic" - flag: "-fPIC" - } - flag_group { - expand_if_none_available: "pic" - flag: "-fPIE" - } - } - } - - # Security hardening on by default. - feature { - name: "hardening" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases. - # We need to undef it before redefining it as some distributions now - # have it enabled by default. - flag: "-U_FORTIFY_SOURCE" - flag: "-D_FORTIFY_SOURCE=1" - flag: "-fstack-protector" - } - } - flag_set { - action: "c++-link-executable" - flag_group { - flag: "-pie" - } - } - } - - feature { - name: "warnings" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - # All warnings are enabled. Maybe enable -Werror as well? - flag: "-Wall" - - } - } - } - - # Keep stack frames for debugging, even in opt mode. - feature { - name: "frame-pointer" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - flag: "-fno-omit-frame-pointer" - } - } - } - - feature { - name: "no-canonical-prefixes" - flag_set { - action: "c-compile" - action: "c++-compile" - action: "c++-link-executable" - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag:"-no-canonical-prefixes" - } - } - } - - feature { - name: "disable-assertions" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - flag: "-DNDEBUG" - } - } - } - - feature { - name: "linker-bin-path" - - flag_set { - action: "c++-link-executable" - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "-B/opt/rh/devtoolset-7/root/usr/bin" - } - } - } - - feature { - name: "undefined-dynamic" - flag_set { - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - action: "c++-link-executable" - flag_group { - flag: "-undefined" - flag: "dynamic_lookup" - } - } - } - - feature { - name: "common" - implies: "stdlib" - implies: "c++11" - implies: "determinism" - implies: "hardening" - implies: "warnings" - implies: "frame-pointer" - implies: "no-canonical-prefixes" - implies: "linker-bin-path" - implies: "undefined-dynamic" - } - - feature { - name: "opt" - implies: "common" - implies: "disable-assertions" - - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - # No debug symbols. - # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt - # or even generally? However, that can't happen here, as it requires - # special handling in Bazel. - flag: "-g0" - - # Conservative choice for -O - # -O3 can increase binary size and even slow down the resulting binaries. - # Profile first and / or use FDO if you need better performance than this. - flag: "-O2" - - # Removal of unused code and data at link time (can this increase binary size in some cases?). - flag: "-ffunction-sections" - flag: "-fdata-sections" - } - } - } - - feature { - name: "fastbuild" - implies: "common" - } - - feature { - name: "dbg" - implies: "common" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - flag: "-g" - } - } - } - - # Set clang as a C/C++ compiler. - tool_path { name: "gcc" path: "clang/bin/crosstool_wrapper_driver_is_not_gcc" } - - # Use the default system toolchain for everything else. 
- tool_path { name: "ar" path: "/usr/bin/libtool" } - tool_path { name: "compat-ld" path: "/usr/bin/ld" } - tool_path { name: "cpp" path: "/usr/bin/cpp" } - tool_path { name: "dwp" path: "/usr/bin/dwp" } - tool_path { name: "gcov" path: "/usr/bin/gcov" } - tool_path { name: "ld" path: "/usr/bin/ld" } - tool_path { name: "nm" path: "/usr/bin/nm" } - tool_path { name: "objcopy" path: "/usr/bin/objcopy" } - tool_path { name: "objdump" path: "/usr/bin/objdump" } - tool_path { name: "strip" path: "/usr/bin/strip" } - - # Enabled dynamic linking. - linking_mode_flags { mode: DYNAMIC } - - cxx_builtin_include_directory: "/opt/rh/devtoolset-7/root/usr/include/c++/7" - cxx_builtin_include_directory: "/opt/rh/devtoolset-7/root/usr/include/c++/7/x86_64-redhat-linux" - cxx_builtin_include_directory: "/opt/rh/devtoolset-7/root/usr/include/c++/7/backward" - cxx_builtin_include_directory: "/opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/include" - cxx_builtin_include_directory: "/usr/local/include" - cxx_builtin_include_directory: "/opt/rh/devtoolset-7/root/usr/include" - cxx_builtin_include_directory: "/usr/include" - cxx_builtin_include_directory: "/usr/local/cuda-10.0/targets/x86_64-linux/include" - cxx_builtin_include_directory: "/usr/local/cuda-10.0/include" - cxx_builtin_include_directory: "/usr/local/cuda-10.0/extras/CUPTI/include" - cxx_builtin_include_directory: "/usr/local/cuda-10.0/include" -} - -toolchain { - toolchain_identifier: "local_windows" - host_system_name: "local" - target_system_name: "local" - - abi_version: "local" - abi_libc_version: "local" - target_cpu: "x64_windows" - compiler: "msvc-cl" - target_libc: "msvcrt" - - - - tool_path { - name: "ar" - path: "" - } - tool_path { - name: "ml" - path: "" - } - tool_path { - name: "cpp" - path: "" - } - tool_path { - name: "gcc" - path: "" - } - tool_path { - name: "gcov" - path: "wrapper/bin/msvc_nop.bat" - } - tool_path { - name: "ld" - path: "" - } - tool_path { - name: "nm" - path: "wrapper/bin/msvc_nop.bat" - } - tool_path { - name: "objcopy" - path: "wrapper/bin/msvc_nop.bat" - } - tool_path { - name: "objdump" - path: "wrapper/bin/msvc_nop.bat" - } - tool_path { - name: "strip" - path: "wrapper/bin/msvc_nop.bat" - } - supports_interface_shared_objects: true - - # TODO(pcloudy): Review those flags below, they should be defined by cl.exe - compiler_flag: "/DCOMPILER_MSVC" - - # Don't define min/max macros in windows.h. - compiler_flag: "/DNOMINMAX" - - # Platform defines. - compiler_flag: "/D_WIN32_WINNT=0x0600" - # Turn off warning messages. - compiler_flag: "/D_CRT_SECURE_NO_DEPRECATE" - compiler_flag: "/D_CRT_SECURE_NO_WARNINGS" - compiler_flag: "/D_SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS" - - # Useful options to have on for compilation. - # Increase the capacity of object files to 2^32 sections. - compiler_flag: "/bigobj" - # Allocate 500MB for precomputed headers. - compiler_flag: "/Zm500" - # Use unsigned char by default. - compiler_flag: "/J" - # Use function level linking. - compiler_flag: "/Gy" - # Use string pooling. - compiler_flag: "/GF" - # Catch C++ exceptions only and tell the compiler to assume that functions declared - # as extern "C" never throw a C++ exception. - compiler_flag: "/EHsc" - - # Globally disabled warnings. - # Don't warn about elements of array being be default initialized. - compiler_flag: "/wd4351" - # Don't warn about no matching delete found. - compiler_flag: "/wd4291" - # Don't warn about diamond inheritance patterns. 
- compiler_flag: "/wd4250" - # Don't warn about insecure functions (e.g. non _s functions). - compiler_flag: "/wd4996" - - linker_flag: "/MACHINE:X64" - - feature { - name: "no_legacy_features" - } - - # TODO(klimek): Previously we were using a .bat file to start python to run - # the python script that can redirect to nvcc - unfortunately .bat files - # have a rather short maximum length for command lines (8k). Instead, we - # now use the python binary as the compiler and pass the python script to - # it at the start of the command line. Investigate different possibilities - # to run the nvcc wrapper, either using pyinstaller --onefile, or writing - # a small C++ wrapper to redirect. - feature { - name: "redirector" - enabled: true - flag_set { - action: "c-compile" - action: "c++-compile" - action: "c++-module-compile" - action: "c++-module-codegen" - action: "c++-header-parsing" - action: "assemble" - action: "preprocess-assemble" - flag_group { - flag: "-B" - flag: "external/local_config_cuda/crosstool/windows/msvc_wrapper_for_nvcc.py" - } - } - } - - # Suppress startup banner. - feature { - name: "nologo" - flag_set { - action: "c-compile" - action: "c++-compile" - action: "c++-module-compile" - action: "c++-module-codegen" - action: "c++-header-parsing" - action: "assemble" - action: "preprocess-assemble" - action: "c++-link-executable" - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - action: "c++-link-static-library" - flag_group { - flag: "/nologo" - } - } - } - - feature { - name: 'has_configured_linker_path' - } - - # This feature indicates strip is not supported, building stripped binary will just result a copy of orignial binary - feature { - name: 'no_stripping' - } - - # This feature indicates this is a toolchain targeting Windows. 
- feature { - name: 'targets_windows' - implies: 'copy_dynamic_libraries_to_binary' - enabled: true - } - - feature { - name: 'copy_dynamic_libraries_to_binary' - } - - action_config { - config_name: 'assemble' - action_name: 'assemble' - tool { - tool_path: '' - } - implies: 'compiler_input_flags' - implies: 'compiler_output_flags' - implies: 'nologo' - implies: 'msvc_env' - implies: 'sysroot' - } - - action_config { - config_name: 'preprocess-assemble' - action_name: 'preprocess-assemble' - tool { - tool_path: '' - } - implies: 'compiler_input_flags' - implies: 'compiler_output_flags' - implies: 'nologo' - implies: 'msvc_env' - implies: 'sysroot' - } - - action_config { - config_name: 'c-compile' - action_name: 'c-compile' - tool { - tool_path: '' - } - implies: 'compiler_input_flags' - implies: 'compiler_output_flags' - implies: 'legacy_compile_flags' - implies: 'nologo' - implies: 'msvc_env' - implies: 'parse_showincludes' - implies: 'user_compile_flags' - implies: 'sysroot' - implies: 'unfiltered_compile_flags' - } - - action_config { - config_name: 'c++-compile' - action_name: 'c++-compile' - tool { - tool_path: '' - } - implies: 'compiler_input_flags' - implies: 'compiler_output_flags' - implies: 'legacy_compile_flags' - implies: 'nologo' - implies: 'msvc_env' - implies: 'parse_showincludes' - implies: 'user_compile_flags' - implies: 'sysroot' - implies: 'unfiltered_compile_flags' - } - - action_config { - config_name: 'c++-link-executable' - action_name: 'c++-link-executable' - tool { - tool_path: '' - } - implies: 'nologo' - implies: 'linkstamps' - implies: 'output_execpath_flags' - implies: 'input_param_flags' - implies: 'user_link_flags' - implies: 'legacy_link_flags' - implies: 'linker_subsystem_flag' - implies: 'linker_param_file' - implies: 'msvc_env' - implies: 'no_stripping' - } - - action_config { - config_name: 'c++-link-dynamic-library' - action_name: 'c++-link-dynamic-library' - tool { - tool_path: '' - } - implies: 'nologo' - implies: 'shared_flag' - implies: 'linkstamps' - implies: 'output_execpath_flags' - implies: 'input_param_flags' - implies: 'user_link_flags' - implies: 'legacy_link_flags' - implies: 'linker_subsystem_flag' - implies: 'linker_param_file' - implies: 'msvc_env' - implies: 'no_stripping' - implies: 'has_configured_linker_path' - implies: 'def_file' - } - - action_config { - config_name: 'c++-link-nodeps-dynamic-library' - action_name: 'c++-link-nodeps-dynamic-library' - tool { - tool_path: '' - } - implies: 'nologo' - implies: 'shared_flag' - implies: 'linkstamps' - implies: 'output_execpath_flags' - implies: 'input_param_flags' - implies: 'user_link_flags' - implies: 'legacy_link_flags' - implies: 'linker_subsystem_flag' - implies: 'linker_param_file' - implies: 'msvc_env' - implies: 'no_stripping' - implies: 'has_configured_linker_path' - implies: 'def_file' - } - - action_config { - config_name: 'c++-link-static-library' - action_name: 'c++-link-static-library' - tool { - tool_path: '' - } - implies: 'nologo' - implies: 'archiver_flags' - implies: 'input_param_flags' - implies: 'linker_param_file' - implies: 'msvc_env' - } - - # TODO(b/65151735): Remove legacy_compile_flags feature when legacy fields are - # not used in this crosstool - feature { - name: 'legacy_compile_flags' - flag_set { - expand_if_all_available: 'legacy_compile_flags' - action: 'preprocess-assemble' - action: 'c-compile' - action: 'c++-compile' - action: 'c++-header-parsing' - action: 'c++-module-compile' - action: 'c++-module-codegen' - flag_group { - iterate_over: 
'legacy_compile_flags' - flag: '%{legacy_compile_flags}' - } - } - } - - feature { - name: "msvc_env" - env_set { - action: "c-compile" - action: "c++-compile" - action: "c++-module-compile" - action: "c++-module-codegen" - action: "c++-header-parsing" - action: "assemble" - action: "preprocess-assemble" - action: "c++-link-executable" - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - action: "c++-link-static-library" - env_entry { - key: "PATH" - value: "" - } - env_entry { - key: "INCLUDE" - value: "" - } - env_entry { - key: "LIB" - value: "" - } - env_entry { - key: "TMP" - value: "" - } - env_entry { - key: "TEMP" - value: "" - } - } - } - - feature { - name: 'include_paths' - flag_set { - action: "assemble" - action: 'preprocess-assemble' - action: 'c-compile' - action: 'c++-compile' - action: 'c++-header-parsing' - action: 'c++-module-compile' - flag_group { - iterate_over: 'quote_include_paths' - flag: '/I%{quote_include_paths}' - } - flag_group { - iterate_over: 'include_paths' - flag: '/I%{include_paths}' - } - flag_group { - iterate_over: 'system_include_paths' - flag: '/I%{system_include_paths}' - } - } - } - - feature { - name: "preprocessor_defines" - flag_set { - action: "assemble" - action: "preprocess-assemble" - action: "c-compile" - action: "c++-compile" - action: "c++-header-parsing" - action: "c++-module-compile" - flag_group { - flag: "/D%{preprocessor_defines}" - iterate_over: "preprocessor_defines" - } - } - } - - # Tell Bazel to parse the output of /showIncludes - feature { - name: 'parse_showincludes' - flag_set { - action: 'preprocess-assemble' - action: 'c-compile' - action: 'c++-compile' - action: 'c++-module-compile' - action: 'c++-header-parsing' - flag_group { - flag: "/showIncludes" - } - } - } - - - feature { - name: 'generate_pdb_file' - requires: { - feature: 'dbg' - } - requires: { - feature: 'fastbuild' - } - } - - feature { - name: 'shared_flag' - flag_set { - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: '/DLL' - } - } - } - - feature { - name: 'linkstamps' - flag_set { - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - expand_if_all_available: 'linkstamp_paths' - flag_group { - iterate_over: 'linkstamp_paths' - flag: '%{linkstamp_paths}' - } - } - } - - feature { - name: 'output_execpath_flags' - flag_set { - expand_if_all_available: 'output_execpath' - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: '/OUT:%{output_execpath}' - } - } - } - - feature { - name: 'archiver_flags' - flag_set { - expand_if_all_available: 'output_execpath' - action: 'c++-link-static-library' - flag_group { - flag: '/OUT:%{output_execpath}' - } - } - } - - feature { - name: 'input_param_flags' - flag_set { - expand_if_all_available: 'interface_library_output_path' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "/IMPLIB:%{interface_library_output_path}" - } - } - flag_set { - expand_if_all_available: 'libopts' - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - iterate_over: 'libopts' - flag: '%{libopts}' - } - } - flag_set { - expand_if_all_available: 'libraries_to_link' - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - action: 
'c++-link-static-library' - flag_group { - iterate_over: 'libraries_to_link' - flag_group { - expand_if_equal: { - variable: 'libraries_to_link.type' - value: 'object_file_group' - } - iterate_over: 'libraries_to_link.object_files' - flag_group { - flag: '%{libraries_to_link.object_files}' - } - } - flag_group { - expand_if_equal: { - variable: 'libraries_to_link.type' - value: 'object_file' - } - flag_group { - flag: '%{libraries_to_link.name}' - } - } - flag_group { - expand_if_equal: { - variable: 'libraries_to_link.type' - value: 'interface_library' - } - flag_group { - flag: '%{libraries_to_link.name}' - } - } - flag_group { - expand_if_equal: { - variable: 'libraries_to_link.type' - value: 'static_library' - } - flag_group { - expand_if_false: 'libraries_to_link.is_whole_archive' - flag: '%{libraries_to_link.name}' - } - flag_group { - expand_if_true: 'libraries_to_link.is_whole_archive' - flag: '/WHOLEARCHIVE:%{libraries_to_link.name}' - } - } - } - } - } - - # Since this feature is declared earlier in the CROSSTOOL than - # "user_link_flags", this feature will be applied prior to it anywhere they - # are both implied. And since "user_link_flags" contains the linkopts from - # the build rule, this allows the user to override the /SUBSYSTEM in the BUILD - # file. - feature { - name: 'linker_subsystem_flag' - flag_set { - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: '/SUBSYSTEM:CONSOLE' - } - } - } - - # The "user_link_flags" feature contains user-defined linkopts (from build rules), - # so it should be defined after features that declare user-overridable flags. - # For example, "linker_subsystem_flag" defines a default "/SUBSYSTEM" flag, - # but we want to let the user override it; therefore "linker_subsystem_flag" is - # defined earlier in the CROSSTOOL file than "user_link_flags". 
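# For illustration, this is the kind of override the ordering enables — a
# hypothetical BUILD target (not from this change), assuming the MSVC linker
# honors the last /SUBSYSTEM flag on its command line:
#
#   cc_binary(
#       name = "gui_app",        # hypothetical example target
#       srcs = ["main.cc"],
#       # Expands through "user_link_flags", after the default
#       # /SUBSYSTEM:CONSOLE above, so the later value takes effect.
#       linkopts = ["/SUBSYSTEM:WINDOWS"],
#   )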
- feature { - name: 'user_link_flags' - flag_set { - expand_if_all_available: 'user_link_flags' - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - iterate_over: 'user_link_flags' - flag: '%{user_link_flags}' - } - } - } - feature { - name: 'legacy_link_flags' - flag_set { - expand_if_all_available: 'legacy_link_flags' - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - iterate_over: 'legacy_link_flags' - flag: '%{legacy_link_flags}' - } - } - } - - feature { - name: 'linker_param_file' - flag_set { - expand_if_all_available: 'linker_param_file' - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - action: 'c++-link-static-library' - flag_group { - flag: '@%{linker_param_file}' - } - } - } - - feature { - name: 'static_link_msvcrt' - } - - feature { - name: 'static_link_msvcrt_no_debug' - flag_set { - action: 'c-compile' - action: 'c++-compile' - flag_group { - flag: "/MT" - } - } - flag_set { - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "/DEFAULTLIB:libcmt.lib" - } - } - requires: { feature: 'fastbuild'} - requires: { feature: 'opt'} - } - - feature { - name: 'dynamic_link_msvcrt_no_debug' - flag_set { - action: 'c-compile' - action: 'c++-compile' - flag_group { - flag: "/MD" - } - } - flag_set { - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "/DEFAULTLIB:msvcrt.lib" - } - } - requires: { feature: 'fastbuild'} - requires: { feature: 'opt'} - } - - feature { - name: 'static_link_msvcrt_debug' - flag_set { - action: 'c-compile' - action: 'c++-compile' - flag_group { - flag: "/MTd" - } - } - flag_set { - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "/DEFAULTLIB:libcmtd.lib" - } - } - requires: { feature: 'dbg'} - } - - feature { - name: 'dynamic_link_msvcrt_debug' - flag_set { - action: 'c-compile' - action: 'c++-compile' - flag_group { - flag: "/MDd" - } - } - flag_set { - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "/DEFAULTLIB:msvcrtd.lib" - } - } - requires: { feature: 'dbg'} - } - - feature { - name: 'dbg' - flag_set { - action: 'c-compile' - action: 'c++-compile' - flag_group { - flag: "/Od" - flag: "/Z7" - flag: "/DDEBUG" - } - } - flag_set { - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "/DEBUG:FULL" - flag: "/INCREMENTAL:NO" - } - } - implies: 'generate_pdb_file' - } - - feature { - name: 'fastbuild' - flag_set { - action: 'c-compile' - action: 'c++-compile' - flag_group { - flag: "/Od" - flag: "/Z7" - flag: "/DDEBUG" - } - } - flag_set { - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "/DEBUG:FASTLINK" - flag: "/INCREMENTAL:NO" - } - } - implies: 'generate_pdb_file' - } - - feature { - name: 'opt' - flag_set { - action: 'c-compile' - action: 'c++-compile' - flag_group { - flag: "/O2" - flag: "/DNDEBUG" - } - } - } - - feature { - name: 'user_compile_flags' - flag_set { - expand_if_all_available: 'user_compile_flags' - action: 
'preprocess-assemble' - action: 'c-compile' - action: 'c++-compile' - action: 'c++-header-parsing' - action: 'c++-module-compile' - action: 'c++-module-codegen' - flag_group { - iterate_over: 'user_compile_flags' - flag: '%{user_compile_flags}' - } - } - } - - feature { - name: 'sysroot' - flag_set { - expand_if_all_available: 'sysroot' - action: 'assemble' - action: 'preprocess-assemble' - action: 'c-compile' - action: 'c++-compile' - action: 'c++-header-parsing' - action: 'c++-module-compile' - action: 'c++-module-codegen' - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - iterate_over: 'sysroot' - flag: '--sysroot=%{sysroot}' - } - } - } - - feature { - name: 'unfiltered_compile_flags' - flag_set { - expand_if_all_available: 'unfiltered_compile_flags' - action: 'preprocess-assemble' - action: 'c-compile' - action: 'c++-compile' - action: 'c++-header-parsing' - action: 'c++-module-compile' - action: 'c++-module-codegen' - flag_group { - iterate_over: 'unfiltered_compile_flags' - flag: '%{unfiltered_compile_flags}' - } - } - } - - feature { - name: 'compiler_output_flags' - flag_set { - action: 'assemble' - flag_group { - expand_if_all_available: 'output_file' - expand_if_none_available: 'output_assembly_file' - expand_if_none_available: 'output_preprocess_file' - flag: '/Fo%{output_file}' - flag: '/Zi' - } - } - flag_set { - action: 'preprocess-assemble' - action: 'c-compile' - action: 'c++-compile' - action: 'c++-header-parsing' - action: 'c++-module-compile' - action: 'c++-module-codegen' - flag_group { - expand_if_all_available: 'output_file' - expand_if_none_available: 'output_assembly_file' - expand_if_none_available: 'output_preprocess_file' - flag: '/Fo%{output_file}' - } - flag_group { - expand_if_all_available: 'output_file' - expand_if_all_available: 'output_assembly_file' - flag: '/Fa%{output_file}' - } - flag_group { - expand_if_all_available: 'output_file' - expand_if_all_available: 'output_preprocess_file' - flag: '/P' - flag: '/Fi%{output_file}' - } - } - } - - feature { - name: 'compiler_input_flags' - flag_set { - action: 'assemble' - action: 'preprocess-assemble' - action: 'c-compile' - action: 'c++-compile' - action: 'c++-header-parsing' - action: 'c++-module-compile' - action: 'c++-module-codegen' - flag_group { - expand_if_all_available: 'source_file' - flag: '/c' - flag: '%{source_file}' - } - } - } - - feature { - name : 'def_file', - flag_set { - expand_if_all_available: 'def_file_path' - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "/DEF:%{def_file_path}" - # We can specify a different DLL name in the DEF file; /ignore:4070 suppresses - # the warning about the DLL name not matching the default one. 
- # See https://msdn.microsoft.com/en-us/library/sfkk2fz7.aspx - flag: "/ignore:4070" - } - } - } - - feature { - name: 'windows_export_all_symbols' - } - - feature { - name: 'no_windows_export_all_symbols' - } - - linking_mode_flags { mode: DYNAMIC } -} diff --git a/third_party/toolchains/preconfig/centos6/gcc7-nvcc-cuda10.0/cc_toolchain_config.bzl b/third_party/toolchains/preconfig/centos6/gcc7-nvcc-cuda10.0/cc_toolchain_config.bzl deleted file mode 100755 index 282ba08cda5..00000000000 --- a/third_party/toolchains/preconfig/centos6/gcc7-nvcc-cuda10.0/cc_toolchain_config.bzl +++ /dev/null @@ -1,1486 +0,0 @@ -"""cc_toolchain_config rule for configuring CUDA toolchains on Linux, Mac, and Windows.""" - -load( - "@bazel_tools//tools/cpp:cc_toolchain_config_lib.bzl", - "action_config", - "env_entry", - "env_set", - "feature", - "feature_set", - "flag_group", - "flag_set", - "tool", - "tool_path", - "variable_with_value", -) -load( - "@bazel_tools//tools/build_defs/cc:action_names.bzl", - "ASSEMBLE_ACTION_NAME", - "CC_FLAGS_MAKE_VARIABLE_ACTION_NAME", - "CLIF_MATCH_ACTION_NAME", - "CPP_COMPILE_ACTION_NAME", - "CPP_HEADER_PARSING_ACTION_NAME", - "CPP_LINK_DYNAMIC_LIBRARY_ACTION_NAME", - "CPP_LINK_EXECUTABLE_ACTION_NAME", - "CPP_LINK_NODEPS_DYNAMIC_LIBRARY_ACTION_NAME", - "CPP_LINK_STATIC_LIBRARY_ACTION_NAME", - "CPP_MODULE_CODEGEN_ACTION_NAME", - "CPP_MODULE_COMPILE_ACTION_NAME", - "C_COMPILE_ACTION_NAME", - "LINKSTAMP_COMPILE_ACTION_NAME", - "LTO_BACKEND_ACTION_NAME", - "LTO_INDEXING_ACTION_NAME", - "OBJCPP_COMPILE_ACTION_NAME", - "OBJCPP_EXECUTABLE_ACTION_NAME", - "OBJC_ARCHIVE_ACTION_NAME", - "OBJC_COMPILE_ACTION_NAME", - "OBJC_EXECUTABLE_ACTION_NAME", - "OBJC_FULLY_LINK_ACTION_NAME", - "PREPROCESS_ASSEMBLE_ACTION_NAME", - "STRIP_ACTION_NAME", -) - -ACTION_NAMES = struct( - c_compile = C_COMPILE_ACTION_NAME, - cpp_compile = CPP_COMPILE_ACTION_NAME, - linkstamp_compile = LINKSTAMP_COMPILE_ACTION_NAME, - cc_flags_make_variable = CC_FLAGS_MAKE_VARIABLE_ACTION_NAME, - cpp_module_codegen = CPP_MODULE_CODEGEN_ACTION_NAME, - cpp_header_parsing = CPP_HEADER_PARSING_ACTION_NAME, - cpp_module_compile = CPP_MODULE_COMPILE_ACTION_NAME, - assemble = ASSEMBLE_ACTION_NAME, - preprocess_assemble = PREPROCESS_ASSEMBLE_ACTION_NAME, - lto_indexing = LTO_INDEXING_ACTION_NAME, - lto_backend = LTO_BACKEND_ACTION_NAME, - cpp_link_executable = CPP_LINK_EXECUTABLE_ACTION_NAME, - cpp_link_dynamic_library = CPP_LINK_DYNAMIC_LIBRARY_ACTION_NAME, - cpp_link_nodeps_dynamic_library = CPP_LINK_NODEPS_DYNAMIC_LIBRARY_ACTION_NAME, - cpp_link_static_library = CPP_LINK_STATIC_LIBRARY_ACTION_NAME, - strip = STRIP_ACTION_NAME, - objc_archive = OBJC_ARCHIVE_ACTION_NAME, - objc_compile = OBJC_COMPILE_ACTION_NAME, - objc_executable = OBJC_EXECUTABLE_ACTION_NAME, - objc_fully_link = OBJC_FULLY_LINK_ACTION_NAME, - objcpp_compile = OBJCPP_COMPILE_ACTION_NAME, - objcpp_executable = OBJCPP_EXECUTABLE_ACTION_NAME, - clif_match = CLIF_MATCH_ACTION_NAME, - objcopy_embed_data = "objcopy_embed_data", - ld_embed_data = "ld_embed_data", -) - -def _impl(ctx): - if (ctx.attr.cpu == "darwin"): - toolchain_identifier = "local_darwin" - elif (ctx.attr.cpu == "local"): - toolchain_identifier = "local_linux" - elif (ctx.attr.cpu == "x64_windows"): - toolchain_identifier = "local_windows" - else: - fail("Unreachable") - - host_system_name = "local" - - target_system_name = "local" - - if (ctx.attr.cpu == "darwin"): - target_cpu = "darwin" - elif (ctx.attr.cpu == "local"): - target_cpu = "local" - elif (ctx.attr.cpu == "x64_windows"): - target_cpu = 
"x64_windows" - else: - fail("Unreachable") - - if (ctx.attr.cpu == "local"): - target_libc = "local" - elif (ctx.attr.cpu == "darwin"): - target_libc = "macosx" - elif (ctx.attr.cpu == "x64_windows"): - target_libc = "msvcrt" - else: - fail("Unreachable") - - if (ctx.attr.cpu == "darwin" or - ctx.attr.cpu == "local"): - compiler = "compiler" - elif (ctx.attr.cpu == "x64_windows"): - compiler = "msvc-cl" - else: - fail("Unreachable") - - abi_version = "local" - - abi_libc_version = "local" - - cc_target_os = None - - builtin_sysroot = None - - all_link_actions = [ - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ] - - cpp_link_dynamic_library_action = action_config( - action_name = ACTION_NAMES.cpp_link_dynamic_library, - implies = [ - "nologo", - "shared_flag", - "linkstamps", - "output_execpath_flags", - "input_param_flags", - "user_link_flags", - "linker_subsystem_flag", - "linker_param_file", - "msvc_env", - "no_stripping", - "has_configured_linker_path", - "def_file", - ], - tools = [tool(path = ctx.attr.msvc_link_path)], - ) - - cpp_link_nodeps_dynamic_library_action = action_config( - action_name = ACTION_NAMES.cpp_link_nodeps_dynamic_library, - implies = [ - "nologo", - "shared_flag", - "linkstamps", - "output_execpath_flags", - "input_param_flags", - "user_link_flags", - "linker_subsystem_flag", - "linker_param_file", - "msvc_env", - "no_stripping", - "has_configured_linker_path", - "def_file", - ], - tools = [tool(path = ctx.attr.msvc_link_path)], - ) - - cpp_link_static_library_action = action_config( - action_name = ACTION_NAMES.cpp_link_static_library, - implies = [ - "nologo", - "archiver_flags", - "input_param_flags", - "linker_param_file", - "msvc_env", - ], - tools = [tool(path = ctx.attr.msvc_lib_path)], - ) - - assemble_action = action_config( - action_name = ACTION_NAMES.assemble, - implies = [ - "compiler_input_flags", - "compiler_output_flags", - "nologo", - "msvc_env", - "sysroot", - ], - tools = [tool(path = ctx.attr.msvc_ml_path)], - ) - - preprocess_assemble_action = action_config( - action_name = ACTION_NAMES.preprocess_assemble, - implies = [ - "compiler_input_flags", - "compiler_output_flags", - "nologo", - "msvc_env", - "sysroot", - ], - tools = [tool(path = ctx.attr.msvc_ml_path)], - ) - - c_compile_action = action_config( - action_name = ACTION_NAMES.c_compile, - implies = [ - "compiler_input_flags", - "compiler_output_flags", - "nologo", - "msvc_env", - "parse_showincludes", - "user_compile_flags", - "sysroot", - "unfiltered_compile_flags", - ], - tools = [tool(path = ctx.attr.msvc_cl_path)], - ) - - cpp_compile_action = action_config( - action_name = ACTION_NAMES.cpp_compile, - implies = [ - "compiler_input_flags", - "compiler_output_flags", - "nologo", - "msvc_env", - "parse_showincludes", - "user_compile_flags", - "sysroot", - "unfiltered_compile_flags", - ], - tools = [tool(path = ctx.attr.msvc_cl_path)], - ) - - cpp_link_executable_action = action_config( - action_name = ACTION_NAMES.cpp_link_executable, - implies = [ - "nologo", - "linkstamps", - "output_execpath_flags", - "input_param_flags", - "user_link_flags", - "linker_subsystem_flag", - "linker_param_file", - "msvc_env", - "no_stripping", - ], - tools = [tool(path = ctx.attr.msvc_link_path)], - ) - - if (ctx.attr.cpu == "darwin" or - ctx.attr.cpu == "local"): - action_configs = [] - elif (ctx.attr.cpu == "x64_windows"): - action_configs = [ - assemble_action, - preprocess_assemble_action, - c_compile_action, - 
cpp_compile_action, - cpp_link_executable_action, - cpp_link_dynamic_library_action, - cpp_link_nodeps_dynamic_library_action, - cpp_link_static_library_action, - ] - else: - fail("Unreachable") - - no_windows_export_all_symbols_feature = feature(name = "no_windows_export_all_symbols") - - pic_feature = feature( - name = "pic", - enabled = True, - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group(flags = ["-fPIC"], expand_if_available = "pic"), - flag_group( - flags = ["-fPIE"], - expand_if_not_available = "pic", - ), - ], - ), - ], - ) - - preprocessor_defines_feature = feature( - name = "preprocessor_defines", - enabled = True, - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ], - flag_groups = [ - flag_group( - flags = ["/D%{preprocessor_defines}"], - iterate_over = "preprocessor_defines", - ), - ], - ), - ], - ) - - generate_pdb_file_feature = feature( - name = "generate_pdb_file", - requires = [ - feature_set(features = ["dbg"]), - feature_set(features = ["fastbuild"]), - ], - ) - - linkstamps_feature = feature( - name = "linkstamps", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group( - flags = ["%{linkstamp_paths}"], - iterate_over = "linkstamp_paths", - expand_if_available = "linkstamp_paths", - ), - ], - ), - ], - ) - - unfiltered_compile_flags_feature = feature( - name = "unfiltered_compile_flags", - flag_sets = ([ - flag_set( - actions = [ - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ], - flag_groups = [ - flag_group( - flags = ctx.attr.host_unfiltered_compile_flags, - ), - ], - ), - ] if ctx.attr.host_unfiltered_compile_flags else []), - ) - - determinism_feature = feature( - name = "determinism", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group( - flags = [ - "-Wno-builtin-macro-redefined", - "-D__DATE__=\"redacted\"", - "-D__TIMESTAMP__=\"redacted\"", - "-D__TIME__=\"redacted\"", - ], - ), - ], - ), - ], - ) - - nologo_feature = feature( - name = "nologo", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ACTION_NAMES.cpp_link_static_library, - ], - flag_groups = [flag_group(flags = ["/nologo"])], - ), - ], - ) - - supports_pic_feature = feature(name = "supports_pic", enabled = True) - - output_execpath_flags_feature = feature( - name = "output_execpath_flags", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group( - flags = ["/OUT:%{output_execpath}"], - expand_if_available = "output_execpath", - ), - ], - ), - ], - ) - - default_link_flags_feature = feature( - name = "default_link_flags", - enabled = True, - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/MACHINE:X64"])], - ), - ], - ) - - if (ctx.attr.cpu == "local"): - hardening_feature = feature( - name 
= "hardening", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group( - flags = [ - "-U_FORTIFY_SOURCE", - "-D_FORTIFY_SOURCE=1", - "-fstack-protector", - ], - ), - ], - ), - flag_set( - actions = [ - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ], - flag_groups = [flag_group(flags = ["-Wl,-z,relro,-z,now"])], - ), - flag_set( - actions = [ACTION_NAMES.cpp_link_executable], - flag_groups = [flag_group(flags = ["-pie", "-Wl,-z,relro,-z,now"])], - ), - ], - ) - elif (ctx.attr.cpu == "darwin"): - hardening_feature = feature( - name = "hardening", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group( - flags = [ - "-U_FORTIFY_SOURCE", - "-D_FORTIFY_SOURCE=1", - "-fstack-protector", - ], - ), - ], - ), - flag_set( - actions = [ACTION_NAMES.cpp_link_executable], - flag_groups = [flag_group(flags = ["-pie"])], - ), - ], - ) - else: - hardening_feature = None - - supports_dynamic_linker_feature = feature(name = "supports_dynamic_linker", enabled = True) - - targets_windows_feature = feature( - name = "targets_windows", - enabled = True, - implies = ["copy_dynamic_libraries_to_binary"], - ) - - msvc_env_feature = feature( - name = "msvc_env", - env_sets = [ - env_set( - actions = [ - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ACTION_NAMES.cpp_link_static_library, - ], - env_entries = [ - env_entry(key = "PATH", value = ctx.attr.msvc_env_path), - env_entry( - key = "INCLUDE", - value = ctx.attr.msvc_env_include, - ), - env_entry(key = "LIB", value = ctx.attr.msvc_env_lib), - env_entry(key = "TMP", value = ctx.attr.msvc_env_tmp), - env_entry(key = "TEMP", value = ctx.attr.msvc_env_tmp), - ], - ), - ], - ) - - linker_subsystem_flag_feature = feature( - name = "linker_subsystem_flag", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/SUBSYSTEM:CONSOLE"])], - ), - ], - ) - - dynamic_link_msvcrt_no_debug_feature = feature( - name = "dynamic_link_msvcrt_no_debug", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/MD"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/DEFAULTLIB:msvcrt.lib"])], - ), - ], - requires = [ - feature_set(features = ["fastbuild"]), - feature_set(features = ["opt"]), - ], - ) - - warnings_feature = feature( - name = "warnings", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group( - flags = ["-Wall"] + ctx.attr.host_compiler_warnings, - ), - ], - ), - ], - ) - - dynamic_link_msvcrt_debug_feature = feature( - name = "dynamic_link_msvcrt_debug", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/MDd"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/DEFAULTLIB:msvcrtd.lib"])], - ), - ], - requires = [feature_set(features = ["dbg"])], - ) - - compiler_output_flags_feature = feature( - name = "compiler_output_flags", - flag_sets = [ - flag_set( - actions = 
[ACTION_NAMES.assemble], - flag_groups = [ - flag_group( - flag_groups = [ - flag_group( - flags = ["/Fo%{output_file}", "/Zi"], - expand_if_not_available = "output_preprocess_file", - ), - ], - expand_if_available = "output_file", - expand_if_not_available = "output_assembly_file", - ), - ], - ), - flag_set( - actions = [ - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ], - flag_groups = [ - flag_group( - flag_groups = [ - flag_group( - flags = ["/Fo%{output_file}"], - expand_if_not_available = "output_preprocess_file", - ), - ], - expand_if_available = "output_file", - expand_if_not_available = "output_assembly_file", - ), - flag_group( - flag_groups = [ - flag_group( - flags = ["/Fa%{output_file}"], - expand_if_available = "output_assembly_file", - ), - ], - expand_if_available = "output_file", - ), - flag_group( - flag_groups = [ - flag_group( - flags = ["/P", "/Fi%{output_file}"], - expand_if_available = "output_preprocess_file", - ), - ], - expand_if_available = "output_file", - ), - ], - ), - ], - ) - - default_compile_flags_feature = feature( - name = "default_compile_flags", - enabled = True, - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.linkstamp_compile, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.lto_backend, - ACTION_NAMES.clif_match, - ], - flag_groups = [ - flag_group( - flags = [ - "/DCOMPILER_MSVC", - "/DNOMINMAX", - "/D_WIN32_WINNT=0x0600", - "/D_CRT_SECURE_NO_DEPRECATE", - "/D_CRT_SECURE_NO_WARNINGS", - "/D_SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS", - "/bigobj", - "/Zm500", - "/J", - "/Gy", - "/GF", - "/EHsc", - "/wd4351", - "/wd4291", - "/wd4250", - "/wd4996", - ], - ), - ], - ), - ], - ) - - static_link_msvcrt_debug_feature = feature( - name = "static_link_msvcrt_debug", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/MTd"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/DEFAULTLIB:libcmtd.lib"])], - ), - ], - requires = [feature_set(features = ["dbg"])], - ) - - static_link_msvcrt_feature = feature(name = "static_link_msvcrt") - - if (ctx.attr.cpu == "darwin" or - ctx.attr.cpu == "local"): - dbg_feature = feature( - name = "dbg", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["-g"])], - ), - ], - implies = ["common"], - ) - elif (ctx.attr.cpu == "x64_windows"): - dbg_feature = feature( - name = "dbg", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/Od", "/Z7", "/DDEBUG"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/DEBUG:FULL", "/INCREMENTAL:NO"])], - ), - ], - implies = ["generate_pdb_file"], - ) - else: - dbg_feature = None - - undefined_dynamic_feature = feature( - name = "undefined-dynamic", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ACTION_NAMES.cpp_link_executable, - ], - flag_groups = [flag_group(flags = ["-undefined", "dynamic_lookup"])], - ), - ], - ) - - parse_showincludes_feature = feature( - name 
= "parse_showincludes", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_header_parsing, - ], - flag_groups = [flag_group(flags = ["/showIncludes"])], - ), - ], - ) - - linker_param_file_feature = feature( - name = "linker_param_file", - flag_sets = [ - flag_set( - actions = all_link_actions + - [ACTION_NAMES.cpp_link_static_library], - flag_groups = [ - flag_group( - flags = ["@%{linker_param_file}"], - expand_if_available = "linker_param_file", - ), - ], - ), - ], - ) - - static_link_msvcrt_no_debug_feature = feature( - name = "static_link_msvcrt_no_debug", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/MT"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/DEFAULTLIB:libcmt.lib"])], - ), - ], - requires = [ - feature_set(features = ["fastbuild"]), - feature_set(features = ["opt"]), - ], - ) - - supports_interface_shared_libraries_feature = feature( - name = "supports_interface_shared_libraries", - enabled = True, - ) - - disable_assertions_feature = feature( - name = "disable-assertions", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["-DNDEBUG"])], - ), - ], - ) - - if (ctx.attr.cpu == "x64_windows"): - fastbuild_feature = feature( - name = "fastbuild", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/Od", "/Z7", "/DDEBUG"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group(flags = ["/DEBUG:FASTLINK", "/INCREMENTAL:NO"]), - ], - ), - ], - implies = ["generate_pdb_file"], - ) - elif (ctx.attr.cpu == "darwin" or - ctx.attr.cpu == "local"): - fastbuild_feature = feature(name = "fastbuild", implies = ["common"]) - else: - fastbuild_feature = None - - user_compile_flags_feature = feature( - name = "user_compile_flags", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ], - flag_groups = [ - flag_group( - flags = ["%{user_compile_flags}"], - iterate_over = "user_compile_flags", - expand_if_available = "user_compile_flags", - ), - ], - ), - ], - ) - - compiler_input_flags_feature = feature( - name = "compiler_input_flags", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ], - flag_groups = [ - flag_group( - flags = ["/c", "%{source_file}"], - expand_if_available = "source_file", - ), - ], - ), - ], - ) - - no_legacy_features_feature = feature(name = "no_legacy_features") - - archiver_flags_feature = feature( - name = "archiver_flags", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.cpp_link_static_library], - flag_groups = [ - flag_group( - flags = ["/OUT:%{output_execpath}"], - expand_if_available = "output_execpath", - ), - ], - ), - ], - ) - - redirector_feature = feature( - name = "redirector", - enabled = True, - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_module_compile, 
- ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ], - flag_groups = [ - flag_group( - flags = [ - "-B", - "external/local_config_cuda/crosstool/windows/msvc_wrapper_for_nvcc.py", - ], - ), - ], - ), - ], - ) - - linker_bin_path_feature = feature( - name = "linker-bin-path", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["-B" + ctx.attr.linker_bin_path])], - ), - ], - ) - - if (ctx.attr.cpu == "local"): - opt_feature = feature( - name = "opt", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group( - flags = ["-g0", "-O2", "-ffunction-sections", "-fdata-sections"], - ), - ], - ), - flag_set( - actions = [ - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ACTION_NAMES.cpp_link_executable, - ], - flag_groups = [flag_group(flags = ["-Wl,--gc-sections"])], - ), - ], - implies = ["common", "disable-assertions"], - ) - elif (ctx.attr.cpu == "darwin"): - opt_feature = feature( - name = "opt", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group( - flags = ["-g0", "-O2", "-ffunction-sections", "-fdata-sections"], - ), - ], - ), - ], - implies = ["common", "disable-assertions"], - ) - elif (ctx.attr.cpu == "x64_windows"): - opt_feature = feature( - name = "opt", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/O2", "/DNDEBUG"])], - ), - ], - ) - else: - opt_feature = None - - include_paths_feature = feature( - name = "include_paths", - enabled = True, - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ], - flag_groups = [ - flag_group( - flags = ["/I%{quote_include_paths}"], - iterate_over = "quote_include_paths", - ), - flag_group( - flags = ["/I%{include_paths}"], - iterate_over = "include_paths", - ), - flag_group( - flags = ["/I%{system_include_paths}"], - iterate_over = "system_include_paths", - ), - ], - ), - ], - ) - - shared_flag_feature = feature( - name = "shared_flag", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ], - flag_groups = [flag_group(flags = ["/DLL"])], - ), - ], - ) - - windows_export_all_symbols_feature = feature(name = "windows_export_all_symbols") - - frame_pointer_feature = feature( - name = "frame-pointer", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["-fno-omit-frame-pointer"])], - ), - ], - ) - - build_id_feature = feature( - name = "build-id", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group( - flags = ["-Wl,--build-id=md5", "-Wl,--hash-style=gnu"], - ), - ], - ), - ], - ) - - sysroot_feature = feature( - name = "sysroot", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, 
- ], - flag_groups = [ - flag_group( - flags = ["--sysroot=%{sysroot}"], - iterate_over = "sysroot", - expand_if_available = "sysroot", - ), - ], - ), - ], - ) - - def_file_feature = feature( - name = "def_file", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group( - flags = ["/DEF:%{def_file_path}", "/ignore:4070"], - expand_if_available = "def_file_path", - ), - ], - ), - ], - ) - - if (ctx.attr.cpu == "darwin"): - stdlib_feature = feature( - name = "stdlib", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["-lc++"])], - ), - ], - ) - elif (ctx.attr.cpu == "local"): - stdlib_feature = feature( - name = "stdlib", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["-lstdc++"])], - ), - ], - ) - else: - stdlib_feature = None - - no_stripping_feature = feature(name = "no_stripping") - - alwayslink_feature = feature( - name = "alwayslink", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ACTION_NAMES.cpp_link_executable, - ], - flag_groups = [flag_group(flags = ["-Wl,-no-as-needed"])], - ), - ], - ) - - input_param_flags_feature = feature( - name = "input_param_flags", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ], - flag_groups = [ - flag_group( - flags = ["/IMPLIB:%{interface_library_output_path}"], - expand_if_available = "interface_library_output_path", - ), - ], - ), - flag_set( - actions = all_link_actions + - [ACTION_NAMES.cpp_link_static_library], - flag_groups = [ - flag_group( - iterate_over = "libraries_to_link", - flag_groups = [ - flag_group( - iterate_over = "libraries_to_link.object_files", - flag_groups = [flag_group(flags = ["%{libraries_to_link.object_files}"])], - expand_if_equal = variable_with_value( - name = "libraries_to_link.type", - value = "object_file_group", - ), - ), - flag_group( - flag_groups = [flag_group(flags = ["%{libraries_to_link.name}"])], - expand_if_equal = variable_with_value( - name = "libraries_to_link.type", - value = "object_file", - ), - ), - flag_group( - flag_groups = [flag_group(flags = ["%{libraries_to_link.name}"])], - expand_if_equal = variable_with_value( - name = "libraries_to_link.type", - value = "interface_library", - ), - ), - flag_group( - flag_groups = [ - flag_group( - flags = ["%{libraries_to_link.name}"], - expand_if_false = "libraries_to_link.is_whole_archive", - ), - flag_group( - flags = ["/WHOLEARCHIVE:%{libraries_to_link.name}"], - expand_if_true = "libraries_to_link.is_whole_archive", - ), - ], - expand_if_equal = variable_with_value( - name = "libraries_to_link.type", - value = "static_library", - ), - ), - ], - expand_if_available = "libraries_to_link", - ), - ], - ), - ], - ) - - if (ctx.attr.cpu == "local"): - no_canonical_prefixes_feature = feature( - name = "no-canonical-prefixes", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ], - flag_groups = [ - flag_group( - flags = [ - "-no-canonical-prefixes", - ] + ctx.attr.extra_no_canonical_prefixes_flags, - ), - ], - ), - ], - ) - elif (ctx.attr.cpu == "darwin"): - no_canonical_prefixes_feature = feature( - name = "no-canonical-prefixes", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.c_compile, - 
ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ], - flag_groups = [flag_group(flags = ["-no-canonical-prefixes"])], - ), - ], - ) - else: - no_canonical_prefixes_feature = None - - has_configured_linker_path_feature = feature(name = "has_configured_linker_path") - - copy_dynamic_libraries_to_binary_feature = feature(name = "copy_dynamic_libraries_to_binary") - - user_link_flags_feature = feature( - name = "user_link_flags", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group( - flags = ["%{user_link_flags}"], - iterate_over = "user_link_flags", - expand_if_available = "user_link_flags", - ), - ], - ), - ], - ) - - cpp11_feature = feature( - name = "c++11", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["-std=c++11"])], - ), - ], - ) - - if (ctx.attr.cpu == "local"): - common_feature = feature( - name = "common", - implies = [ - "stdlib", - "c++11", - "determinism", - "alwayslink", - "hardening", - "warnings", - "frame-pointer", - "build-id", - "no-canonical-prefixes", - "linker-bin-path", - ], - ) - elif (ctx.attr.cpu == "darwin"): - common_feature = feature( - name = "common", - implies = [ - "stdlib", - "c++11", - "determinism", - "hardening", - "warnings", - "frame-pointer", - "no-canonical-prefixes", - "linker-bin-path", - "undefined-dynamic", - ], - ) - else: - common_feature = None - - if (ctx.attr.cpu == "local"): - features = [ - cpp11_feature, - stdlib_feature, - determinism_feature, - alwayslink_feature, - pic_feature, - hardening_feature, - warnings_feature, - frame_pointer_feature, - build_id_feature, - no_canonical_prefixes_feature, - disable_assertions_feature, - linker_bin_path_feature, - common_feature, - opt_feature, - fastbuild_feature, - dbg_feature, - supports_dynamic_linker_feature, - supports_pic_feature, - ] - elif (ctx.attr.cpu == "darwin"): - features = [ - cpp11_feature, - stdlib_feature, - determinism_feature, - pic_feature, - hardening_feature, - warnings_feature, - frame_pointer_feature, - no_canonical_prefixes_feature, - disable_assertions_feature, - linker_bin_path_feature, - undefined_dynamic_feature, - common_feature, - opt_feature, - fastbuild_feature, - dbg_feature, - supports_dynamic_linker_feature, - supports_pic_feature, - ] - elif (ctx.attr.cpu == "x64_windows"): - features = [ - no_legacy_features_feature, - redirector_feature, - nologo_feature, - has_configured_linker_path_feature, - no_stripping_feature, - targets_windows_feature, - copy_dynamic_libraries_to_binary_feature, - default_compile_flags_feature, - msvc_env_feature, - include_paths_feature, - preprocessor_defines_feature, - parse_showincludes_feature, - generate_pdb_file_feature, - shared_flag_feature, - linkstamps_feature, - output_execpath_flags_feature, - archiver_flags_feature, - input_param_flags_feature, - linker_subsystem_flag_feature, - user_link_flags_feature, - default_link_flags_feature, - linker_param_file_feature, - static_link_msvcrt_feature, - static_link_msvcrt_no_debug_feature, - dynamic_link_msvcrt_no_debug_feature, - static_link_msvcrt_debug_feature, - dynamic_link_msvcrt_debug_feature, - dbg_feature, - fastbuild_feature, - opt_feature, - user_compile_flags_feature, - sysroot_feature, - unfiltered_compile_flags_feature, - compiler_output_flags_feature, - compiler_input_flags_feature, - def_file_feature, - windows_export_all_symbols_feature, - 
no_windows_export_all_symbols_feature, - supports_dynamic_linker_feature, - supports_interface_shared_libraries_feature, - ] - else: - fail("Unreachable") - - cxx_builtin_include_directories = ctx.attr.builtin_include_directories - - if (ctx.attr.cpu == "x64_windows"): - tool_paths = [ - tool_path(name = "ar", path = ctx.attr.msvc_lib_path), - tool_path(name = "ml", path = ctx.attr.msvc_ml_path), - tool_path(name = "cpp", path = ctx.attr.msvc_cl_path), - tool_path(name = "gcc", path = ctx.attr.msvc_cl_path), - tool_path(name = "gcov", path = "wrapper/bin/msvc_nop.bat"), - tool_path(name = "ld", path = ctx.attr.msvc_link_path), - tool_path(name = "nm", path = "wrapper/bin/msvc_nop.bat"), - tool_path( - name = "objcopy", - path = "wrapper/bin/msvc_nop.bat", - ), - tool_path( - name = "objdump", - path = "wrapper/bin/msvc_nop.bat", - ), - tool_path( - name = "strip", - path = "wrapper/bin/msvc_nop.bat", - ), - ] - elif (ctx.attr.cpu == "local"): - tool_paths = [ - tool_path(name = "gcc", path = ctx.attr.host_compiler_path), - tool_path(name = "ar", path = ctx.attr.host_compiler_prefix + "/ar"), - tool_path(name = "compat-ld", path = ctx.attr.host_compiler_prefix + "/ld"), - tool_path(name = "cpp", path = ctx.attr.host_compiler_prefix + "/cpp"), - tool_path(name = "dwp", path = ctx.attr.host_compiler_prefix + "/dwp"), - tool_path(name = "gcov", path = ctx.attr.host_compiler_prefix + "/gcov"), - tool_path(name = "ld", path = ctx.attr.host_compiler_prefix + "/ld"), - tool_path(name = "nm", path = ctx.attr.host_compiler_prefix + "/nm"), - tool_path(name = "objcopy", path = ctx.attr.host_compiler_prefix + "/objcopy"), - tool_path(name = "objdump", path = ctx.attr.host_compiler_prefix + "/objdump"), - tool_path(name = "strip", path = ctx.attr.host_compiler_prefix + "/strip"), - ] - elif (ctx.attr.cpu == "darwin"): - tool_paths = [ - tool_path(name = "gcc", path = ctx.attr.host_compiler_path), - tool_path(name = "ar", path = ctx.attr.host_compiler_prefix + "/libtool"), - tool_path(name = "compat-ld", path = ctx.attr.host_compiler_prefix + "/ld"), - tool_path(name = "cpp", path = ctx.attr.host_compiler_prefix + "/cpp"), - tool_path(name = "dwp", path = ctx.attr.host_compiler_prefix + "/dwp"), - tool_path(name = "gcov", path = ctx.attr.host_compiler_prefix + "/gcov"), - tool_path(name = "ld", path = ctx.attr.host_compiler_prefix + "/ld"), - tool_path(name = "nm", path = ctx.attr.host_compiler_prefix + "/nm"), - tool_path(name = "objcopy", path = ctx.attr.host_compiler_prefix + "/objcopy"), - tool_path(name = "objdump", path = ctx.attr.host_compiler_prefix + "/objdump"), - tool_path(name = "strip", path = ctx.attr.host_compiler_prefix + "/strip"), - ] - else: - fail("Unreachable") - - out = ctx.actions.declare_file(ctx.label.name) - ctx.actions.write(out, "Fake executable") - return [ - cc_common.create_cc_toolchain_config_info( - ctx = ctx, - features = features, - action_configs = action_configs, - artifact_name_patterns = [], - cxx_builtin_include_directories = cxx_builtin_include_directories, - toolchain_identifier = toolchain_identifier, - host_system_name = host_system_name, - target_system_name = target_system_name, - target_cpu = target_cpu, - target_libc = target_libc, - compiler = compiler, - abi_version = abi_version, - abi_libc_version = abi_libc_version, - tool_paths = tool_paths, - make_variables = [], - builtin_sysroot = builtin_sysroot, - cc_target_os = cc_target_os, - ), - DefaultInfo( - executable = out, - ), - ] - -cc_toolchain_config = rule( - implementation = _impl, - attrs = { - 
"cpu": attr.string(mandatory = True, values = ["darwin", "local", "x64_windows"]), - "builtin_include_directories": attr.string_list(), - "extra_no_canonical_prefixes_flags": attr.string_list(), - "host_compiler_path": attr.string(), - "host_compiler_prefix": attr.string(), - "host_compiler_warnings": attr.string_list(), - "host_unfiltered_compile_flags": attr.string_list(), - "linker_bin_path": attr.string(), - "msvc_cl_path": attr.string(default = "msvc_not_used"), - "msvc_env_include": attr.string(default = "msvc_not_used"), - "msvc_env_lib": attr.string(default = "msvc_not_used"), - "msvc_env_path": attr.string(default = "msvc_not_used"), - "msvc_env_tmp": attr.string(default = "msvc_not_used"), - "msvc_lib_path": attr.string(default = "msvc_not_used"), - "msvc_link_path": attr.string(default = "msvc_not_used"), - "msvc_ml_path": attr.string(default = "msvc_not_used"), - }, - provides = [CcToolchainConfigInfo], - executable = True, -) diff --git a/third_party/toolchains/preconfig/centos6/gcc7-nvcc-cuda10.0/clang/bin/crosstool_wrapper_driver_is_not_gcc b/third_party/toolchains/preconfig/centos6/gcc7-nvcc-cuda10.0/clang/bin/crosstool_wrapper_driver_is_not_gcc deleted file mode 100755 index d7dc3a6511a..00000000000 --- a/third_party/toolchains/preconfig/centos6/gcc7-nvcc-cuda10.0/clang/bin/crosstool_wrapper_driver_is_not_gcc +++ /dev/null @@ -1,267 +0,0 @@ -#!/usr/bin/env python2 -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Crosstool wrapper for compiling CUDA programs. - -SYNOPSIS: - crosstool_wrapper_is_not_gcc [options passed in by cc_library() - or cc_binary() rule] - -DESCRIPTION: - This script is expected to be called by the cc_library() or cc_binary() bazel - rules. When the option "-x cuda" is present in the list of arguments passed - to this script, it invokes the nvcc CUDA compiler. Most arguments are passed - as is as a string to --compiler-options of nvcc. When "-x cuda" is not - present, this wrapper invokes hybrid_driver_is_not_gcc with the input - arguments as is. - -NOTES: - Changes to the contents of this file must be propagated from - //third_party/gpus/crosstool/crosstool_wrapper_is_not_gcc to - //third_party/gpus/crosstool/v*/*/clang/bin/crosstool_wrapper_is_not_gcc -""" - -from __future__ import print_function - -__author__ = 'keveman@google.com (Manjunath Kudlur)' - -from argparse import ArgumentParser -import os -import subprocess -import re -import sys -import pipes - -# Template values set by cuda_autoconf. -CPU_COMPILER = ('/opt/rh/devtoolset-7/root/usr/bin/gcc') -GCC_HOST_COMPILER_PATH = ('/opt/rh/devtoolset-7/root/usr/bin/gcc') - -NVCC_PATH = '/usr/local/cuda-10.0/bin/nvcc' -PREFIX_DIR = os.path.dirname(GCC_HOST_COMPILER_PATH) -NVCC_VERSION = '10.0' - -def Log(s): - print('gpus/crosstool: {0}'.format(s)) - - -def GetOptionValue(argv, option): - """Extract the list of values for option from the argv list. 
- - Args: - argv: A list of strings, possibly the argv passed to main(). - option: The option whose value to extract, without the leading '-'. - - Returns: - A list of values, either directly following the option, - (eg., -opt val1 val2) or values collected from multiple occurrences of - the option (eg., -opt val1 -opt val2). - """ - - parser = ArgumentParser() - parser.add_argument('-' + option, nargs='*', action='append') - args, _ = parser.parse_known_args(argv) - if not args or not vars(args)[option]: - return [] - else: - return sum(vars(args)[option], []) - - -def GetHostCompilerOptions(argv): - """Collect the -isystem, -iquote, and --sysroot option values from argv. - - Args: - argv: A list of strings, possibly the argv passed to main(). - - Returns: - The string that can be used as the --compiler-options to nvcc. - """ - - parser = ArgumentParser() - parser.add_argument('-isystem', nargs='*', action='append') - parser.add_argument('-iquote', nargs='*', action='append') - parser.add_argument('--sysroot', nargs=1) - parser.add_argument('-g', nargs='*', action='append') - parser.add_argument('-fno-canonical-system-headers', action='store_true') - parser.add_argument('-no-canonical-prefixes', action='store_true') - - args, _ = parser.parse_known_args(argv) - - opts = '' - - if args.isystem: - opts += ' -isystem ' + ' -isystem '.join(sum(args.isystem, [])) - if args.iquote: - opts += ' -iquote ' + ' -iquote '.join(sum(args.iquote, [])) - if args.g: - opts += ' -g' + ' -g'.join(sum(args.g, [])) - if args.fno_canonical_system_headers: - opts += ' -fno-canonical-system-headers' - if args.no_canonical_prefixes: - opts += ' -no-canonical-prefixes' - if args.sysroot: - opts += ' --sysroot ' + args.sysroot[0] - - return opts - -def _update_options(nvcc_options): - if NVCC_VERSION in ("7.0",): - return nvcc_options - - update_options = { "relaxed-constexpr" : "expt-relaxed-constexpr" } - return [ update_options[opt] if opt in update_options else opt - for opt in nvcc_options ] - -def GetNvccOptions(argv): - """Collect the -nvcc_options values from argv. - - Args: - argv: A list of strings, possibly the argv passed to main(). - - Returns: - The string that can be passed directly to nvcc. - """ - - parser = ArgumentParser() - parser.add_argument('-nvcc_options', nargs='*', action='append') - - args, _ = parser.parse_known_args(argv) - - if args.nvcc_options: - options = _update_options(sum(args.nvcc_options, [])) - return ' '.join(['--'+a for a in options]) - return '' - - -def InvokeNvcc(argv, log=False): - """Call nvcc with arguments assembled from argv. - - Args: - argv: A list of strings, possibly the argv passed to main(). - log: True if logging is requested. 
- - Returns: - The return value of calling os.system('nvcc ' + args) - """ - - host_compiler_options = GetHostCompilerOptions(argv) - nvcc_compiler_options = GetNvccOptions(argv) - opt_option = GetOptionValue(argv, 'O') - m_options = GetOptionValue(argv, 'm') - m_options = ''.join([' -m' + m for m in m_options if m in ['32', '64']]) - include_options = GetOptionValue(argv, 'I') - out_file = GetOptionValue(argv, 'o') - depfiles = GetOptionValue(argv, 'MF') - defines = GetOptionValue(argv, 'D') - defines = ''.join([' -D' + define for define in defines]) - undefines = GetOptionValue(argv, 'U') - undefines = ''.join([' -U' + define for define in undefines]) - std_options = GetOptionValue(argv, 'std') - # currently only c++11 is supported by Cuda 7.0 std argument - nvcc_allowed_std_options = ["c++11"] - std_options = ''.join([' -std=' + define - for define in std_options if define in nvcc_allowed_std_options]) - - # The list of source files get passed after the -c option. I don't know of - # any other reliable way to just get the list of source files to be compiled. - src_files = GetOptionValue(argv, 'c') - - # Pass -w through from host to nvcc, but don't do anything fancier with - # warnings-related flags, since they're not necessarily the same across - # compilers. - warning_options = ' -w' if '-w' in argv else '' - - if len(src_files) == 0: - return 1 - if len(out_file) != 1: - return 1 - - opt = (' -O2' if (len(opt_option) > 0 and int(opt_option[0]) > 0) - else ' -g -G') - - includes = (' -I ' + ' -I '.join(include_options) - if len(include_options) > 0 - else '') - - # Unfortunately, there are other options that have -c prefix too. - # So allowing only those look like C/C++ files. - src_files = [f for f in src_files if - re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)] - srcs = ' '.join(src_files) - out = ' -o ' + out_file[0] - - supported_cuda_compute_capabilities = [ "3.0", "6.0" ] - nvccopts = '-D_FORCE_INLINES ' - for capability in supported_cuda_compute_capabilities: - capability = capability.replace('.', '') - nvccopts += r'-gencode=arch=compute_%s,\"code=sm_%s,compute_%s\" ' % ( - capability, capability, capability) - nvccopts += ' ' + nvcc_compiler_options - nvccopts += undefines - nvccopts += defines - nvccopts += std_options - nvccopts += m_options - nvccopts += warning_options - - if depfiles: - # Generate the dependency file - depfile = depfiles[0] - cmd = (NVCC_PATH + ' ' + nvccopts + - ' --compiler-options "' + host_compiler_options + '"' + - ' --compiler-bindir=' + GCC_HOST_COMPILER_PATH + - ' -I .' + - ' -x cu ' + opt + includes + ' ' + srcs + ' -M -o ' + depfile) - if log: Log(cmd) - exit_status = os.system(cmd) - if exit_status != 0: - return exit_status - - cmd = (NVCC_PATH + ' ' + nvccopts + - ' --compiler-options "' + host_compiler_options + ' -fPIC"' + - ' --compiler-bindir=' + GCC_HOST_COMPILER_PATH + - ' -I .' + - ' -x cu ' + opt + includes + ' -c ' + srcs + out) - - # TODO(zhengxq): for some reason, 'gcc' needs this help to find 'as'. - # Need to investigate and fix. 
- cmd = 'PATH=' + PREFIX_DIR + ':$PATH ' + cmd - if log: Log(cmd) - return os.system(cmd) - - -def main(): - parser = ArgumentParser() - parser.add_argument('-x', nargs=1) - parser.add_argument('--cuda_log', action='store_true') - args, leftover = parser.parse_known_args(sys.argv[1:]) - - if args.x and args.x[0] == 'cuda': - if args.cuda_log: Log('-x cuda') - leftover = [pipes.quote(s) for s in leftover] - if args.cuda_log: Log('using nvcc') - return InvokeNvcc(leftover, log=args.cuda_log) - - # Strip our flags before passing through to the CPU compiler for files which - # are not -x cuda. We can't just pass 'leftover' because it also strips -x. - # We not only want to pass -x to the CPU compiler, but also keep it in its - # relative location in the argv list (the compiler is actually sensitive to - # this). - cpu_compiler_flags = [flag for flag in sys.argv[1:] - if not flag.startswith(('--cuda_log'))] - - return subprocess.call([CPU_COMPILER] + cpu_compiler_flags) - -if __name__ == '__main__': - sys.exit(main()) diff --git a/third_party/toolchains/preconfig/centos6/gcc7-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py b/third_party/toolchains/preconfig/centos6/gcc7-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py deleted file mode 100755 index 69fb0713d78..00000000000 --- a/third_party/toolchains/preconfig/centos6/gcc7-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env python -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Crosstool wrapper for compiling CUDA programs with nvcc on Windows. - -DESCRIPTION: - This script is the Windows version of //third_party/gpus/crosstool/crosstool_wrapper_is_not_gcc -""" - -from __future__ import print_function - -from argparse import ArgumentParser -import os -import subprocess -import re -import sys -import pipes - -# Template values set by cuda_autoconf. -CPU_COMPILER = ('/opt/rh/devtoolset-7/root/usr/bin/gcc') -GCC_HOST_COMPILER_PATH = ('/opt/rh/devtoolset-7/root/usr/bin/gcc') - -NVCC_PATH = '/usr/local/cuda-10.0/bin/nvcc' -NVCC_VERSION = '10.0' -NVCC_TEMP_DIR = "C:\\Windows\\Temp\\nvcc_inter_files_tmp_dir" -supported_cuda_compute_capabilities = [ "3.0", "6.0" ] - -def Log(s): - print('gpus/crosstool: {0}'.format(s)) - - -def GetOptionValue(argv, option): - """Extract the list of values for option from options. - - Args: - option: The option whose value to extract, without the leading '/'. - - Returns: - 1. A list of values, either directly following the option, - (eg., /opt val1 val2) or values collected from multiple occurrences of - the option (eg., /opt val1 /opt val2). - 2. The leftover options. 
- """ - - parser = ArgumentParser(prefix_chars='/') - parser.add_argument('/' + option, nargs='*', action='append') - args, leftover = parser.parse_known_args(argv) - if args and vars(args)[option]: - return (sum(vars(args)[option], []), leftover) - return ([], leftover) - -def _update_options(nvcc_options): - if NVCC_VERSION in ("7.0",): - return nvcc_options - - update_options = { "relaxed-constexpr" : "expt-relaxed-constexpr" } - return [ update_options[opt] if opt in update_options else opt - for opt in nvcc_options ] - -def GetNvccOptions(argv): - """Collect the -nvcc_options values from argv. - - Args: - argv: A list of strings, possibly the argv passed to main(). - - Returns: - 1. The string that can be passed directly to nvcc. - 2. The leftover options. - """ - - parser = ArgumentParser() - parser.add_argument('-nvcc_options', nargs='*', action='append') - - args, leftover = parser.parse_known_args(argv) - - if args.nvcc_options: - options = _update_options(sum(args.nvcc_options, [])) - return (['--' + a for a in options], leftover) - return ([], leftover) - - -def InvokeNvcc(argv, log=False): - """Call nvcc with arguments assembled from argv. - - Args: - argv: A list of strings, possibly the argv passed to main(). - log: True if logging is requested. - - Returns: - The return value of calling os.system('nvcc ' + args) - """ - - src_files = [f for f in argv if - re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)] - if len(src_files) == 0: - raise Error('No source files found for cuda compilation.') - - out_file = [ f for f in argv if f.startswith('/Fo') ] - if len(out_file) != 1: - raise Error('Please specify exactly one output file for cuda compilation.') - out = ['-o', out_file[0][len('/Fo'):]] - - nvcc_compiler_options, argv = GetNvccOptions(argv) - - opt_option, argv = GetOptionValue(argv, 'O') - opt = ['-g', '-G'] - if (len(opt_option) > 0 and opt_option[0] != 'd'): - opt = ['-O2'] - - include_options, argv = GetOptionValue(argv, 'I') - includes = ["-I " + include for include in include_options] - - defines, argv = GetOptionValue(argv, 'D') - defines = ['-D' + define for define in defines] - - undefines, argv = GetOptionValue(argv, 'U') - undefines = ['-U' + define for define in undefines] - - # The rest of the unrecognized options should be passed to host compiler - host_compiler_options = [option for option in argv if option not in (src_files + out_file)] - - m_options = ["-m64"] - - nvccopts = ['-D_FORCE_INLINES'] - for capability in supported_cuda_compute_capabilities: - capability = capability.replace('.', '') - nvccopts += [r'-gencode=arch=compute_%s,"code=sm_%s,compute_%s"' % ( - capability, capability, capability)] - nvccopts += nvcc_compiler_options - nvccopts += undefines - nvccopts += defines - nvccopts += m_options - nvccopts += ['--compiler-options="' + " ".join(host_compiler_options) + '"'] - nvccopts += ['-x', 'cu'] + opt + includes + out + ['-c'] + src_files - # If we don't specify --keep-dir, nvcc will generate intermediate files under TEMP - # Put them under NVCC_TEMP_DIR instead, then Bazel can ignore files under NVCC_TEMP_DIR during dependency check - # http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#options-for-guiding-compiler-driver - # Different actions are sharing NVCC_TEMP_DIR, so we cannot remove it if the directory already exists. 
- if os.path.isfile(NVCC_TEMP_DIR): - os.remove(NVCC_TEMP_DIR) - if not os.path.exists(NVCC_TEMP_DIR): - os.makedirs(NVCC_TEMP_DIR) - nvccopts += ['--keep', '--keep-dir', NVCC_TEMP_DIR] - cmd = [NVCC_PATH] + nvccopts - if log: - Log(cmd) - proc = subprocess.Popen(cmd, - stdout=sys.stdout, - stderr=sys.stderr, - env=os.environ.copy(), - shell=True) - proc.wait() - return proc.returncode - -def main(): - parser = ArgumentParser() - parser.add_argument('-x', nargs=1) - parser.add_argument('--cuda_log', action='store_true') - args, leftover = parser.parse_known_args(sys.argv[1:]) - - if args.x and args.x[0] == 'cuda': - if args.cuda_log: Log('-x cuda') - leftover = [pipes.quote(s) for s in leftover] - if args.cuda_log: Log('using nvcc') - return InvokeNvcc(leftover, log=args.cuda_log) - - # Strip our flags before passing through to the CPU compiler for files which - # are not -x cuda. We can't just pass 'leftover' because it also strips -x. - # We not only want to pass -x to the CPU compiler, but also keep it in its - # relative location in the argv list (the compiler is actually sensitive to - # this). - cpu_compiler_flags = [flag for flag in sys.argv[1:] - if not flag.startswith(('--cuda_log')) - and not flag.startswith(('-nvcc_options'))] - - return subprocess.call([CPU_COMPILER] + cpu_compiler_flags) - -if __name__ == '__main__': - sys.exit(main()) diff --git a/third_party/toolchains/preconfig/centos6/gcc7-nvcc-cuda10.1/BUILD b/third_party/toolchains/preconfig/centos6/gcc7-nvcc-cuda10.1/BUILD deleted file mode 100755 index a38be3f0373..00000000000 --- a/third_party/toolchains/preconfig/centos6/gcc7-nvcc-cuda10.1/BUILD +++ /dev/null @@ -1,170 +0,0 @@ -# This file is expanded from a template by cuda_configure.bzl -# Update cuda_configure.bzl#verify_build_defines when adding new variables. - -load(":cc_toolchain_config.bzl", "cc_toolchain_config") - -licenses(["restricted"]) - -package(default_visibility = ["//visibility:public"]) - -toolchain( - name = "toolchain-linux-x86_64", - exec_compatible_with = [ - "@bazel_tools//platforms:linux", - "@bazel_tools//platforms:x86_64", - ], - target_compatible_with = [ - "@bazel_tools//platforms:linux", - "@bazel_tools//platforms:x86_64", - ], - toolchain = ":cc-compiler-local", - toolchain_type = "@bazel_tools//tools/cpp:toolchain_type", -) - -cc_toolchain_suite( - name = "toolchain", - toolchains = { - "local|compiler": ":cc-compiler-local", - "darwin|compiler": ":cc-compiler-darwin", - "x64_windows|msvc-cl": ":cc-compiler-windows", - "x64_windows": ":cc-compiler-windows", - "arm": ":cc-compiler-local", - "aarch64": ":cc-compiler-local", - "k8": ":cc-compiler-local", - "piii": ":cc-compiler-local", - "ppc": ":cc-compiler-local", - "darwin": ":cc-compiler-darwin", - }, -) - -cc_toolchain( - name = "cc-compiler-local", - all_files = ":crosstool_wrapper_driver_is_not_gcc", - compiler_files = ":empty", - dwp_files = ":empty", - linker_files = ":crosstool_wrapper_driver_is_not_gcc", - objcopy_files = ":empty", - strip_files = ":empty", - # To support linker flags that need to go to the start of command line - # we need the toolchain to support parameter files. Parameter files are - # last on the command line and contain all shared libraries to link, so all - # regular options will be left of them. 
- supports_param_files = 1, - toolchain_config = ":cc-compiler-local-config", - toolchain_identifier = "local_linux", -) - -cc_toolchain_config( - name = "cc-compiler-local-config", - builtin_include_directories = [ - "/opt/rh/devtoolset-7/root/usr/include/c++/7", - "/opt/rh/devtoolset-7/root/usr/include/c++/7/x86_64-redhat-linux", - "/opt/rh/devtoolset-7/root/usr/include/c++/7/backward", - "/opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/include", - "/usr/local/include", - "/opt/rh/devtoolset-7/root/usr/include", - "/usr/include", - "/usr/local/cuda-10.1/targets/x86_64-linux/include", - "/usr/local/cuda-10.1/include", - "/usr/local/cuda-10.1/extras/CUPTI/include", - "/usr/local/cuda-10.1/include", - ], - cpu = "local", - extra_no_canonical_prefixes_flags = ["-fno-canonical-system-headers"], - host_compiler_path = "clang/bin/crosstool_wrapper_driver_is_not_gcc", - host_compiler_prefix = "/opt/rh/devtoolset-7/root/usr/bin", - host_compiler_warnings = [], - host_unfiltered_compile_flags = [], - linker_bin_path = "/opt/rh/devtoolset-7/root/usr/bin", -) - -cc_toolchain( - name = "cc-compiler-darwin", - all_files = ":crosstool_wrapper_driver_is_not_gcc", - compiler_files = ":empty", - dwp_files = ":empty", - linker_files = ":crosstool_wrapper_driver_is_not_gcc", - objcopy_files = ":empty", - strip_files = ":empty", - supports_param_files = 0, - toolchain_config = ":cc-compiler-local-darwin", - toolchain_identifier = "local_darwin", -) - -cc_toolchain_config( - name = "cc-compiler-local-darwin", - builtin_include_directories = [ - "/opt/rh/devtoolset-7/root/usr/include/c++/7", - "/opt/rh/devtoolset-7/root/usr/include/c++/7/x86_64-redhat-linux", - "/opt/rh/devtoolset-7/root/usr/include/c++/7/backward", - "/opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/include", - "/usr/local/include", - "/opt/rh/devtoolset-7/root/usr/include", - "/usr/include", - "/usr/local/cuda-10.1/targets/x86_64-linux/include", - "/usr/local/cuda-10.1/include", - "/usr/local/cuda-10.1/extras/CUPTI/include", - "/usr/local/cuda-10.1/include", - ], - cpu = "darwin", - extra_no_canonical_prefixes_flags = ["-fno-canonical-system-headers"], - host_compiler_path = "clang/bin/crosstool_wrapper_driver_is_not_gcc", - host_compiler_prefix = "/opt/rh/devtoolset-7/root/usr/bin", - host_compiler_warnings = [], - host_unfiltered_compile_flags = [], - linker_bin_path = "/opt/rh/devtoolset-7/root/usr/bin", -) - -cc_toolchain( - name = "cc-compiler-windows", - all_files = ":windows_msvc_wrapper_files", - compiler_files = ":empty", - dwp_files = ":empty", - linker_files = ":windows_msvc_wrapper_files", - objcopy_files = ":empty", - strip_files = ":empty", - supports_param_files = 1, - toolchain_config = ":cc-compiler-windows-config", - toolchain_identifier = "local_windows", -) - -cc_toolchain_config( - name = "cc-compiler-windows-config", - builtin_include_directories = [ - "/opt/rh/devtoolset-7/root/usr/include/c++/7", - "/opt/rh/devtoolset-7/root/usr/include/c++/7/x86_64-redhat-linux", - "/opt/rh/devtoolset-7/root/usr/include/c++/7/backward", - "/opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/include", - "/usr/local/include", - "/opt/rh/devtoolset-7/root/usr/include", - "/usr/include", - "/usr/local/cuda-10.1/targets/x86_64-linux/include", - "/usr/local/cuda-10.1/include", - "/usr/local/cuda-10.1/extras/CUPTI/include", - "/usr/local/cuda-10.1/include", - ], - cpu = "x64_windows", - msvc_cl_path = "msvc_not_used", - msvc_env_include = "msvc_not_used", - msvc_env_lib = "msvc_not_used", - msvc_env_path = 
"msvc_not_used", - msvc_env_tmp = "msvc_not_used", - msvc_lib_path = "msvc_not_used", - msvc_link_path = "msvc_not_used", - msvc_ml_path = "msvc_not_used", -) - -filegroup( - name = "empty", - srcs = [], -) - -filegroup( - name = "crosstool_wrapper_driver_is_not_gcc", - srcs = ["clang/bin/crosstool_wrapper_driver_is_not_gcc"], -) - -filegroup( - name = "windows_msvc_wrapper_files", - srcs = glob(["windows/msvc_*"]), -) diff --git a/third_party/toolchains/preconfig/centos6/gcc7-nvcc-cuda10.1/cc_toolchain_config.bzl b/third_party/toolchains/preconfig/centos6/gcc7-nvcc-cuda10.1/cc_toolchain_config.bzl deleted file mode 100755 index 282ba08cda5..00000000000 --- a/third_party/toolchains/preconfig/centos6/gcc7-nvcc-cuda10.1/cc_toolchain_config.bzl +++ /dev/null @@ -1,1486 +0,0 @@ -"""cc_toolchain_config rule for configuring CUDA toolchains on Linux, Mac, and Windows.""" - -load( - "@bazel_tools//tools/cpp:cc_toolchain_config_lib.bzl", - "action_config", - "env_entry", - "env_set", - "feature", - "feature_set", - "flag_group", - "flag_set", - "tool", - "tool_path", - "variable_with_value", -) -load( - "@bazel_tools//tools/build_defs/cc:action_names.bzl", - "ASSEMBLE_ACTION_NAME", - "CC_FLAGS_MAKE_VARIABLE_ACTION_NAME", - "CLIF_MATCH_ACTION_NAME", - "CPP_COMPILE_ACTION_NAME", - "CPP_HEADER_PARSING_ACTION_NAME", - "CPP_LINK_DYNAMIC_LIBRARY_ACTION_NAME", - "CPP_LINK_EXECUTABLE_ACTION_NAME", - "CPP_LINK_NODEPS_DYNAMIC_LIBRARY_ACTION_NAME", - "CPP_LINK_STATIC_LIBRARY_ACTION_NAME", - "CPP_MODULE_CODEGEN_ACTION_NAME", - "CPP_MODULE_COMPILE_ACTION_NAME", - "C_COMPILE_ACTION_NAME", - "LINKSTAMP_COMPILE_ACTION_NAME", - "LTO_BACKEND_ACTION_NAME", - "LTO_INDEXING_ACTION_NAME", - "OBJCPP_COMPILE_ACTION_NAME", - "OBJCPP_EXECUTABLE_ACTION_NAME", - "OBJC_ARCHIVE_ACTION_NAME", - "OBJC_COMPILE_ACTION_NAME", - "OBJC_EXECUTABLE_ACTION_NAME", - "OBJC_FULLY_LINK_ACTION_NAME", - "PREPROCESS_ASSEMBLE_ACTION_NAME", - "STRIP_ACTION_NAME", -) - -ACTION_NAMES = struct( - c_compile = C_COMPILE_ACTION_NAME, - cpp_compile = CPP_COMPILE_ACTION_NAME, - linkstamp_compile = LINKSTAMP_COMPILE_ACTION_NAME, - cc_flags_make_variable = CC_FLAGS_MAKE_VARIABLE_ACTION_NAME, - cpp_module_codegen = CPP_MODULE_CODEGEN_ACTION_NAME, - cpp_header_parsing = CPP_HEADER_PARSING_ACTION_NAME, - cpp_module_compile = CPP_MODULE_COMPILE_ACTION_NAME, - assemble = ASSEMBLE_ACTION_NAME, - preprocess_assemble = PREPROCESS_ASSEMBLE_ACTION_NAME, - lto_indexing = LTO_INDEXING_ACTION_NAME, - lto_backend = LTO_BACKEND_ACTION_NAME, - cpp_link_executable = CPP_LINK_EXECUTABLE_ACTION_NAME, - cpp_link_dynamic_library = CPP_LINK_DYNAMIC_LIBRARY_ACTION_NAME, - cpp_link_nodeps_dynamic_library = CPP_LINK_NODEPS_DYNAMIC_LIBRARY_ACTION_NAME, - cpp_link_static_library = CPP_LINK_STATIC_LIBRARY_ACTION_NAME, - strip = STRIP_ACTION_NAME, - objc_archive = OBJC_ARCHIVE_ACTION_NAME, - objc_compile = OBJC_COMPILE_ACTION_NAME, - objc_executable = OBJC_EXECUTABLE_ACTION_NAME, - objc_fully_link = OBJC_FULLY_LINK_ACTION_NAME, - objcpp_compile = OBJCPP_COMPILE_ACTION_NAME, - objcpp_executable = OBJCPP_EXECUTABLE_ACTION_NAME, - clif_match = CLIF_MATCH_ACTION_NAME, - objcopy_embed_data = "objcopy_embed_data", - ld_embed_data = "ld_embed_data", -) - -def _impl(ctx): - if (ctx.attr.cpu == "darwin"): - toolchain_identifier = "local_darwin" - elif (ctx.attr.cpu == "local"): - toolchain_identifier = "local_linux" - elif (ctx.attr.cpu == "x64_windows"): - toolchain_identifier = "local_windows" - else: - fail("Unreachable") - - host_system_name = "local" - - target_system_name = "local" - - 
if (ctx.attr.cpu == "darwin"): - target_cpu = "darwin" - elif (ctx.attr.cpu == "local"): - target_cpu = "local" - elif (ctx.attr.cpu == "x64_windows"): - target_cpu = "x64_windows" - else: - fail("Unreachable") - - if (ctx.attr.cpu == "local"): - target_libc = "local" - elif (ctx.attr.cpu == "darwin"): - target_libc = "macosx" - elif (ctx.attr.cpu == "x64_windows"): - target_libc = "msvcrt" - else: - fail("Unreachable") - - if (ctx.attr.cpu == "darwin" or - ctx.attr.cpu == "local"): - compiler = "compiler" - elif (ctx.attr.cpu == "x64_windows"): - compiler = "msvc-cl" - else: - fail("Unreachable") - - abi_version = "local" - - abi_libc_version = "local" - - cc_target_os = None - - builtin_sysroot = None - - all_link_actions = [ - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ] - - cpp_link_dynamic_library_action = action_config( - action_name = ACTION_NAMES.cpp_link_dynamic_library, - implies = [ - "nologo", - "shared_flag", - "linkstamps", - "output_execpath_flags", - "input_param_flags", - "user_link_flags", - "linker_subsystem_flag", - "linker_param_file", - "msvc_env", - "no_stripping", - "has_configured_linker_path", - "def_file", - ], - tools = [tool(path = ctx.attr.msvc_link_path)], - ) - - cpp_link_nodeps_dynamic_library_action = action_config( - action_name = ACTION_NAMES.cpp_link_nodeps_dynamic_library, - implies = [ - "nologo", - "shared_flag", - "linkstamps", - "output_execpath_flags", - "input_param_flags", - "user_link_flags", - "linker_subsystem_flag", - "linker_param_file", - "msvc_env", - "no_stripping", - "has_configured_linker_path", - "def_file", - ], - tools = [tool(path = ctx.attr.msvc_link_path)], - ) - - cpp_link_static_library_action = action_config( - action_name = ACTION_NAMES.cpp_link_static_library, - implies = [ - "nologo", - "archiver_flags", - "input_param_flags", - "linker_param_file", - "msvc_env", - ], - tools = [tool(path = ctx.attr.msvc_lib_path)], - ) - - assemble_action = action_config( - action_name = ACTION_NAMES.assemble, - implies = [ - "compiler_input_flags", - "compiler_output_flags", - "nologo", - "msvc_env", - "sysroot", - ], - tools = [tool(path = ctx.attr.msvc_ml_path)], - ) - - preprocess_assemble_action = action_config( - action_name = ACTION_NAMES.preprocess_assemble, - implies = [ - "compiler_input_flags", - "compiler_output_flags", - "nologo", - "msvc_env", - "sysroot", - ], - tools = [tool(path = ctx.attr.msvc_ml_path)], - ) - - c_compile_action = action_config( - action_name = ACTION_NAMES.c_compile, - implies = [ - "compiler_input_flags", - "compiler_output_flags", - "nologo", - "msvc_env", - "parse_showincludes", - "user_compile_flags", - "sysroot", - "unfiltered_compile_flags", - ], - tools = [tool(path = ctx.attr.msvc_cl_path)], - ) - - cpp_compile_action = action_config( - action_name = ACTION_NAMES.cpp_compile, - implies = [ - "compiler_input_flags", - "compiler_output_flags", - "nologo", - "msvc_env", - "parse_showincludes", - "user_compile_flags", - "sysroot", - "unfiltered_compile_flags", - ], - tools = [tool(path = ctx.attr.msvc_cl_path)], - ) - - cpp_link_executable_action = action_config( - action_name = ACTION_NAMES.cpp_link_executable, - implies = [ - "nologo", - "linkstamps", - "output_execpath_flags", - "input_param_flags", - "user_link_flags", - "linker_subsystem_flag", - "linker_param_file", - "msvc_env", - "no_stripping", - ], - tools = [tool(path = ctx.attr.msvc_link_path)], - ) - - if (ctx.attr.cpu == "darwin" or - ctx.attr.cpu == 
"local"): - action_configs = [] - elif (ctx.attr.cpu == "x64_windows"): - action_configs = [ - assemble_action, - preprocess_assemble_action, - c_compile_action, - cpp_compile_action, - cpp_link_executable_action, - cpp_link_dynamic_library_action, - cpp_link_nodeps_dynamic_library_action, - cpp_link_static_library_action, - ] - else: - fail("Unreachable") - - no_windows_export_all_symbols_feature = feature(name = "no_windows_export_all_symbols") - - pic_feature = feature( - name = "pic", - enabled = True, - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group(flags = ["-fPIC"], expand_if_available = "pic"), - flag_group( - flags = ["-fPIE"], - expand_if_not_available = "pic", - ), - ], - ), - ], - ) - - preprocessor_defines_feature = feature( - name = "preprocessor_defines", - enabled = True, - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ], - flag_groups = [ - flag_group( - flags = ["/D%{preprocessor_defines}"], - iterate_over = "preprocessor_defines", - ), - ], - ), - ], - ) - - generate_pdb_file_feature = feature( - name = "generate_pdb_file", - requires = [ - feature_set(features = ["dbg"]), - feature_set(features = ["fastbuild"]), - ], - ) - - linkstamps_feature = feature( - name = "linkstamps", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group( - flags = ["%{linkstamp_paths}"], - iterate_over = "linkstamp_paths", - expand_if_available = "linkstamp_paths", - ), - ], - ), - ], - ) - - unfiltered_compile_flags_feature = feature( - name = "unfiltered_compile_flags", - flag_sets = ([ - flag_set( - actions = [ - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ], - flag_groups = [ - flag_group( - flags = ctx.attr.host_unfiltered_compile_flags, - ), - ], - ), - ] if ctx.attr.host_unfiltered_compile_flags else []), - ) - - determinism_feature = feature( - name = "determinism", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group( - flags = [ - "-Wno-builtin-macro-redefined", - "-D__DATE__=\"redacted\"", - "-D__TIMESTAMP__=\"redacted\"", - "-D__TIME__=\"redacted\"", - ], - ), - ], - ), - ], - ) - - nologo_feature = feature( - name = "nologo", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ACTION_NAMES.cpp_link_static_library, - ], - flag_groups = [flag_group(flags = ["/nologo"])], - ), - ], - ) - - supports_pic_feature = feature(name = "supports_pic", enabled = True) - - output_execpath_flags_feature = feature( - name = "output_execpath_flags", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group( - flags = ["/OUT:%{output_execpath}"], - expand_if_available = "output_execpath", - ), - ], - ), - ], - ) - - default_link_flags_feature = feature( - name = "default_link_flags", - enabled = True, - flag_sets = [ - flag_set( - 
actions = all_link_actions, - flag_groups = [flag_group(flags = ["/MACHINE:X64"])], - ), - ], - ) - - if (ctx.attr.cpu == "local"): - hardening_feature = feature( - name = "hardening", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group( - flags = [ - "-U_FORTIFY_SOURCE", - "-D_FORTIFY_SOURCE=1", - "-fstack-protector", - ], - ), - ], - ), - flag_set( - actions = [ - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ], - flag_groups = [flag_group(flags = ["-Wl,-z,relro,-z,now"])], - ), - flag_set( - actions = [ACTION_NAMES.cpp_link_executable], - flag_groups = [flag_group(flags = ["-pie", "-Wl,-z,relro,-z,now"])], - ), - ], - ) - elif (ctx.attr.cpu == "darwin"): - hardening_feature = feature( - name = "hardening", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group( - flags = [ - "-U_FORTIFY_SOURCE", - "-D_FORTIFY_SOURCE=1", - "-fstack-protector", - ], - ), - ], - ), - flag_set( - actions = [ACTION_NAMES.cpp_link_executable], - flag_groups = [flag_group(flags = ["-pie"])], - ), - ], - ) - else: - hardening_feature = None - - supports_dynamic_linker_feature = feature(name = "supports_dynamic_linker", enabled = True) - - targets_windows_feature = feature( - name = "targets_windows", - enabled = True, - implies = ["copy_dynamic_libraries_to_binary"], - ) - - msvc_env_feature = feature( - name = "msvc_env", - env_sets = [ - env_set( - actions = [ - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ACTION_NAMES.cpp_link_static_library, - ], - env_entries = [ - env_entry(key = "PATH", value = ctx.attr.msvc_env_path), - env_entry( - key = "INCLUDE", - value = ctx.attr.msvc_env_include, - ), - env_entry(key = "LIB", value = ctx.attr.msvc_env_lib), - env_entry(key = "TMP", value = ctx.attr.msvc_env_tmp), - env_entry(key = "TEMP", value = ctx.attr.msvc_env_tmp), - ], - ), - ], - ) - - linker_subsystem_flag_feature = feature( - name = "linker_subsystem_flag", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/SUBSYSTEM:CONSOLE"])], - ), - ], - ) - - dynamic_link_msvcrt_no_debug_feature = feature( - name = "dynamic_link_msvcrt_no_debug", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/MD"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/DEFAULTLIB:msvcrt.lib"])], - ), - ], - requires = [ - feature_set(features = ["fastbuild"]), - feature_set(features = ["opt"]), - ], - ) - - warnings_feature = feature( - name = "warnings", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group( - flags = ["-Wall"] + ctx.attr.host_compiler_warnings, - ), - ], - ), - ], - ) - - dynamic_link_msvcrt_debug_feature = feature( - name = "dynamic_link_msvcrt_debug", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/MDd"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/DEFAULTLIB:msvcrtd.lib"])], - ), - ], - 
requires = [feature_set(features = ["dbg"])], - ) - - compiler_output_flags_feature = feature( - name = "compiler_output_flags", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.assemble], - flag_groups = [ - flag_group( - flag_groups = [ - flag_group( - flags = ["/Fo%{output_file}", "/Zi"], - expand_if_not_available = "output_preprocess_file", - ), - ], - expand_if_available = "output_file", - expand_if_not_available = "output_assembly_file", - ), - ], - ), - flag_set( - actions = [ - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ], - flag_groups = [ - flag_group( - flag_groups = [ - flag_group( - flags = ["/Fo%{output_file}"], - expand_if_not_available = "output_preprocess_file", - ), - ], - expand_if_available = "output_file", - expand_if_not_available = "output_assembly_file", - ), - flag_group( - flag_groups = [ - flag_group( - flags = ["/Fa%{output_file}"], - expand_if_available = "output_assembly_file", - ), - ], - expand_if_available = "output_file", - ), - flag_group( - flag_groups = [ - flag_group( - flags = ["/P", "/Fi%{output_file}"], - expand_if_available = "output_preprocess_file", - ), - ], - expand_if_available = "output_file", - ), - ], - ), - ], - ) - - default_compile_flags_feature = feature( - name = "default_compile_flags", - enabled = True, - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.linkstamp_compile, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.lto_backend, - ACTION_NAMES.clif_match, - ], - flag_groups = [ - flag_group( - flags = [ - "/DCOMPILER_MSVC", - "/DNOMINMAX", - "/D_WIN32_WINNT=0x0600", - "/D_CRT_SECURE_NO_DEPRECATE", - "/D_CRT_SECURE_NO_WARNINGS", - "/D_SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS", - "/bigobj", - "/Zm500", - "/J", - "/Gy", - "/GF", - "/EHsc", - "/wd4351", - "/wd4291", - "/wd4250", - "/wd4996", - ], - ), - ], - ), - ], - ) - - static_link_msvcrt_debug_feature = feature( - name = "static_link_msvcrt_debug", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/MTd"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/DEFAULTLIB:libcmtd.lib"])], - ), - ], - requires = [feature_set(features = ["dbg"])], - ) - - static_link_msvcrt_feature = feature(name = "static_link_msvcrt") - - if (ctx.attr.cpu == "darwin" or - ctx.attr.cpu == "local"): - dbg_feature = feature( - name = "dbg", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["-g"])], - ), - ], - implies = ["common"], - ) - elif (ctx.attr.cpu == "x64_windows"): - dbg_feature = feature( - name = "dbg", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/Od", "/Z7", "/DDEBUG"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/DEBUG:FULL", "/INCREMENTAL:NO"])], - ), - ], - implies = ["generate_pdb_file"], - ) - else: - dbg_feature = None - - undefined_dynamic_feature = feature( - name = "undefined-dynamic", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - 
ACTION_NAMES.cpp_link_executable, - ], - flag_groups = [flag_group(flags = ["-undefined", "dynamic_lookup"])], - ), - ], - ) - - parse_showincludes_feature = feature( - name = "parse_showincludes", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_header_parsing, - ], - flag_groups = [flag_group(flags = ["/showIncludes"])], - ), - ], - ) - - linker_param_file_feature = feature( - name = "linker_param_file", - flag_sets = [ - flag_set( - actions = all_link_actions + - [ACTION_NAMES.cpp_link_static_library], - flag_groups = [ - flag_group( - flags = ["@%{linker_param_file}"], - expand_if_available = "linker_param_file", - ), - ], - ), - ], - ) - - static_link_msvcrt_no_debug_feature = feature( - name = "static_link_msvcrt_no_debug", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/MT"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/DEFAULTLIB:libcmt.lib"])], - ), - ], - requires = [ - feature_set(features = ["fastbuild"]), - feature_set(features = ["opt"]), - ], - ) - - supports_interface_shared_libraries_feature = feature( - name = "supports_interface_shared_libraries", - enabled = True, - ) - - disable_assertions_feature = feature( - name = "disable-assertions", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["-DNDEBUG"])], - ), - ], - ) - - if (ctx.attr.cpu == "x64_windows"): - fastbuild_feature = feature( - name = "fastbuild", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/Od", "/Z7", "/DDEBUG"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group(flags = ["/DEBUG:FASTLINK", "/INCREMENTAL:NO"]), - ], - ), - ], - implies = ["generate_pdb_file"], - ) - elif (ctx.attr.cpu == "darwin" or - ctx.attr.cpu == "local"): - fastbuild_feature = feature(name = "fastbuild", implies = ["common"]) - else: - fastbuild_feature = None - - user_compile_flags_feature = feature( - name = "user_compile_flags", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ], - flag_groups = [ - flag_group( - flags = ["%{user_compile_flags}"], - iterate_over = "user_compile_flags", - expand_if_available = "user_compile_flags", - ), - ], - ), - ], - ) - - compiler_input_flags_feature = feature( - name = "compiler_input_flags", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ], - flag_groups = [ - flag_group( - flags = ["/c", "%{source_file}"], - expand_if_available = "source_file", - ), - ], - ), - ], - ) - - no_legacy_features_feature = feature(name = "no_legacy_features") - - archiver_flags_feature = feature( - name = "archiver_flags", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.cpp_link_static_library], - flag_groups = [ - flag_group( - flags = ["/OUT:%{output_execpath}"], - expand_if_available = "output_execpath", - ), - ], - ), - ], - ) - - redirector_feature = 
feature( - name = "redirector", - enabled = True, - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ], - flag_groups = [ - flag_group( - flags = [ - "-B", - "external/local_config_cuda/crosstool/windows/msvc_wrapper_for_nvcc.py", - ], - ), - ], - ), - ], - ) - - linker_bin_path_feature = feature( - name = "linker-bin-path", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["-B" + ctx.attr.linker_bin_path])], - ), - ], - ) - - if (ctx.attr.cpu == "local"): - opt_feature = feature( - name = "opt", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group( - flags = ["-g0", "-O2", "-ffunction-sections", "-fdata-sections"], - ), - ], - ), - flag_set( - actions = [ - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ACTION_NAMES.cpp_link_executable, - ], - flag_groups = [flag_group(flags = ["-Wl,--gc-sections"])], - ), - ], - implies = ["common", "disable-assertions"], - ) - elif (ctx.attr.cpu == "darwin"): - opt_feature = feature( - name = "opt", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group( - flags = ["-g0", "-O2", "-ffunction-sections", "-fdata-sections"], - ), - ], - ), - ], - implies = ["common", "disable-assertions"], - ) - elif (ctx.attr.cpu == "x64_windows"): - opt_feature = feature( - name = "opt", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/O2", "/DNDEBUG"])], - ), - ], - ) - else: - opt_feature = None - - include_paths_feature = feature( - name = "include_paths", - enabled = True, - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ], - flag_groups = [ - flag_group( - flags = ["/I%{quote_include_paths}"], - iterate_over = "quote_include_paths", - ), - flag_group( - flags = ["/I%{include_paths}"], - iterate_over = "include_paths", - ), - flag_group( - flags = ["/I%{system_include_paths}"], - iterate_over = "system_include_paths", - ), - ], - ), - ], - ) - - shared_flag_feature = feature( - name = "shared_flag", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ], - flag_groups = [flag_group(flags = ["/DLL"])], - ), - ], - ) - - windows_export_all_symbols_feature = feature(name = "windows_export_all_symbols") - - frame_pointer_feature = feature( - name = "frame-pointer", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["-fno-omit-frame-pointer"])], - ), - ], - ) - - build_id_feature = feature( - name = "build-id", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group( - flags = ["-Wl,--build-id=md5", "-Wl,--hash-style=gnu"], - ), - ], - ), - ], - ) - - sysroot_feature = feature( - name = "sysroot", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - 
ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ], - flag_groups = [ - flag_group( - flags = ["--sysroot=%{sysroot}"], - iterate_over = "sysroot", - expand_if_available = "sysroot", - ), - ], - ), - ], - ) - - def_file_feature = feature( - name = "def_file", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group( - flags = ["/DEF:%{def_file_path}", "/ignore:4070"], - expand_if_available = "def_file_path", - ), - ], - ), - ], - ) - - if (ctx.attr.cpu == "darwin"): - stdlib_feature = feature( - name = "stdlib", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["-lc++"])], - ), - ], - ) - elif (ctx.attr.cpu == "local"): - stdlib_feature = feature( - name = "stdlib", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["-lstdc++"])], - ), - ], - ) - else: - stdlib_feature = None - - no_stripping_feature = feature(name = "no_stripping") - - alwayslink_feature = feature( - name = "alwayslink", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ACTION_NAMES.cpp_link_executable, - ], - flag_groups = [flag_group(flags = ["-Wl,-no-as-needed"])], - ), - ], - ) - - input_param_flags_feature = feature( - name = "input_param_flags", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ], - flag_groups = [ - flag_group( - flags = ["/IMPLIB:%{interface_library_output_path}"], - expand_if_available = "interface_library_output_path", - ), - ], - ), - flag_set( - actions = all_link_actions + - [ACTION_NAMES.cpp_link_static_library], - flag_groups = [ - flag_group( - iterate_over = "libraries_to_link", - flag_groups = [ - flag_group( - iterate_over = "libraries_to_link.object_files", - flag_groups = [flag_group(flags = ["%{libraries_to_link.object_files}"])], - expand_if_equal = variable_with_value( - name = "libraries_to_link.type", - value = "object_file_group", - ), - ), - flag_group( - flag_groups = [flag_group(flags = ["%{libraries_to_link.name}"])], - expand_if_equal = variable_with_value( - name = "libraries_to_link.type", - value = "object_file", - ), - ), - flag_group( - flag_groups = [flag_group(flags = ["%{libraries_to_link.name}"])], - expand_if_equal = variable_with_value( - name = "libraries_to_link.type", - value = "interface_library", - ), - ), - flag_group( - flag_groups = [ - flag_group( - flags = ["%{libraries_to_link.name}"], - expand_if_false = "libraries_to_link.is_whole_archive", - ), - flag_group( - flags = ["/WHOLEARCHIVE:%{libraries_to_link.name}"], - expand_if_true = "libraries_to_link.is_whole_archive", - ), - ], - expand_if_equal = variable_with_value( - name = "libraries_to_link.type", - value = "static_library", - ), - ), - ], - expand_if_available = "libraries_to_link", - ), - ], - ), - ], - ) - - if (ctx.attr.cpu == "local"): - no_canonical_prefixes_feature = feature( - name = "no-canonical-prefixes", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ], - flag_groups = [ - flag_group( - flags = [ - "-no-canonical-prefixes", - ] + ctx.attr.extra_no_canonical_prefixes_flags, - ), - ], - 
), - ], - ) - elif (ctx.attr.cpu == "darwin"): - no_canonical_prefixes_feature = feature( - name = "no-canonical-prefixes", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ], - flag_groups = [flag_group(flags = ["-no-canonical-prefixes"])], - ), - ], - ) - else: - no_canonical_prefixes_feature = None - - has_configured_linker_path_feature = feature(name = "has_configured_linker_path") - - copy_dynamic_libraries_to_binary_feature = feature(name = "copy_dynamic_libraries_to_binary") - - user_link_flags_feature = feature( - name = "user_link_flags", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group( - flags = ["%{user_link_flags}"], - iterate_over = "user_link_flags", - expand_if_available = "user_link_flags", - ), - ], - ), - ], - ) - - cpp11_feature = feature( - name = "c++11", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["-std=c++11"])], - ), - ], - ) - - if (ctx.attr.cpu == "local"): - common_feature = feature( - name = "common", - implies = [ - "stdlib", - "c++11", - "determinism", - "alwayslink", - "hardening", - "warnings", - "frame-pointer", - "build-id", - "no-canonical-prefixes", - "linker-bin-path", - ], - ) - elif (ctx.attr.cpu == "darwin"): - common_feature = feature( - name = "common", - implies = [ - "stdlib", - "c++11", - "determinism", - "hardening", - "warnings", - "frame-pointer", - "no-canonical-prefixes", - "linker-bin-path", - "undefined-dynamic", - ], - ) - else: - common_feature = None - - if (ctx.attr.cpu == "local"): - features = [ - cpp11_feature, - stdlib_feature, - determinism_feature, - alwayslink_feature, - pic_feature, - hardening_feature, - warnings_feature, - frame_pointer_feature, - build_id_feature, - no_canonical_prefixes_feature, - disable_assertions_feature, - linker_bin_path_feature, - common_feature, - opt_feature, - fastbuild_feature, - dbg_feature, - supports_dynamic_linker_feature, - supports_pic_feature, - ] - elif (ctx.attr.cpu == "darwin"): - features = [ - cpp11_feature, - stdlib_feature, - determinism_feature, - pic_feature, - hardening_feature, - warnings_feature, - frame_pointer_feature, - no_canonical_prefixes_feature, - disable_assertions_feature, - linker_bin_path_feature, - undefined_dynamic_feature, - common_feature, - opt_feature, - fastbuild_feature, - dbg_feature, - supports_dynamic_linker_feature, - supports_pic_feature, - ] - elif (ctx.attr.cpu == "x64_windows"): - features = [ - no_legacy_features_feature, - redirector_feature, - nologo_feature, - has_configured_linker_path_feature, - no_stripping_feature, - targets_windows_feature, - copy_dynamic_libraries_to_binary_feature, - default_compile_flags_feature, - msvc_env_feature, - include_paths_feature, - preprocessor_defines_feature, - parse_showincludes_feature, - generate_pdb_file_feature, - shared_flag_feature, - linkstamps_feature, - output_execpath_flags_feature, - archiver_flags_feature, - input_param_flags_feature, - linker_subsystem_flag_feature, - user_link_flags_feature, - default_link_flags_feature, - linker_param_file_feature, - static_link_msvcrt_feature, - static_link_msvcrt_no_debug_feature, - dynamic_link_msvcrt_no_debug_feature, - static_link_msvcrt_debug_feature, - dynamic_link_msvcrt_debug_feature, - dbg_feature, - fastbuild_feature, - opt_feature, - user_compile_flags_feature, - 
sysroot_feature, - unfiltered_compile_flags_feature, - compiler_output_flags_feature, - compiler_input_flags_feature, - def_file_feature, - windows_export_all_symbols_feature, - no_windows_export_all_symbols_feature, - supports_dynamic_linker_feature, - supports_interface_shared_libraries_feature, - ] - else: - fail("Unreachable") - - cxx_builtin_include_directories = ctx.attr.builtin_include_directories - - if (ctx.attr.cpu == "x64_windows"): - tool_paths = [ - tool_path(name = "ar", path = ctx.attr.msvc_lib_path), - tool_path(name = "ml", path = ctx.attr.msvc_ml_path), - tool_path(name = "cpp", path = ctx.attr.msvc_cl_path), - tool_path(name = "gcc", path = ctx.attr.msvc_cl_path), - tool_path(name = "gcov", path = "wrapper/bin/msvc_nop.bat"), - tool_path(name = "ld", path = ctx.attr.msvc_link_path), - tool_path(name = "nm", path = "wrapper/bin/msvc_nop.bat"), - tool_path( - name = "objcopy", - path = "wrapper/bin/msvc_nop.bat", - ), - tool_path( - name = "objdump", - path = "wrapper/bin/msvc_nop.bat", - ), - tool_path( - name = "strip", - path = "wrapper/bin/msvc_nop.bat", - ), - ] - elif (ctx.attr.cpu == "local"): - tool_paths = [ - tool_path(name = "gcc", path = ctx.attr.host_compiler_path), - tool_path(name = "ar", path = ctx.attr.host_compiler_prefix + "/ar"), - tool_path(name = "compat-ld", path = ctx.attr.host_compiler_prefix + "/ld"), - tool_path(name = "cpp", path = ctx.attr.host_compiler_prefix + "/cpp"), - tool_path(name = "dwp", path = ctx.attr.host_compiler_prefix + "/dwp"), - tool_path(name = "gcov", path = ctx.attr.host_compiler_prefix + "/gcov"), - tool_path(name = "ld", path = ctx.attr.host_compiler_prefix + "/ld"), - tool_path(name = "nm", path = ctx.attr.host_compiler_prefix + "/nm"), - tool_path(name = "objcopy", path = ctx.attr.host_compiler_prefix + "/objcopy"), - tool_path(name = "objdump", path = ctx.attr.host_compiler_prefix + "/objdump"), - tool_path(name = "strip", path = ctx.attr.host_compiler_prefix + "/strip"), - ] - elif (ctx.attr.cpu == "darwin"): - tool_paths = [ - tool_path(name = "gcc", path = ctx.attr.host_compiler_path), - tool_path(name = "ar", path = ctx.attr.host_compiler_prefix + "/libtool"), - tool_path(name = "compat-ld", path = ctx.attr.host_compiler_prefix + "/ld"), - tool_path(name = "cpp", path = ctx.attr.host_compiler_prefix + "/cpp"), - tool_path(name = "dwp", path = ctx.attr.host_compiler_prefix + "/dwp"), - tool_path(name = "gcov", path = ctx.attr.host_compiler_prefix + "/gcov"), - tool_path(name = "ld", path = ctx.attr.host_compiler_prefix + "/ld"), - tool_path(name = "nm", path = ctx.attr.host_compiler_prefix + "/nm"), - tool_path(name = "objcopy", path = ctx.attr.host_compiler_prefix + "/objcopy"), - tool_path(name = "objdump", path = ctx.attr.host_compiler_prefix + "/objdump"), - tool_path(name = "strip", path = ctx.attr.host_compiler_prefix + "/strip"), - ] - else: - fail("Unreachable") - - out = ctx.actions.declare_file(ctx.label.name) - ctx.actions.write(out, "Fake executable") - return [ - cc_common.create_cc_toolchain_config_info( - ctx = ctx, - features = features, - action_configs = action_configs, - artifact_name_patterns = [], - cxx_builtin_include_directories = cxx_builtin_include_directories, - toolchain_identifier = toolchain_identifier, - host_system_name = host_system_name, - target_system_name = target_system_name, - target_cpu = target_cpu, - target_libc = target_libc, - compiler = compiler, - abi_version = abi_version, - abi_libc_version = abi_libc_version, - tool_paths = tool_paths, - make_variables = [], - 
builtin_sysroot = builtin_sysroot, - cc_target_os = cc_target_os, - ), - DefaultInfo( - executable = out, - ), - ] - -cc_toolchain_config = rule( - implementation = _impl, - attrs = { - "cpu": attr.string(mandatory = True, values = ["darwin", "local", "x64_windows"]), - "builtin_include_directories": attr.string_list(), - "extra_no_canonical_prefixes_flags": attr.string_list(), - "host_compiler_path": attr.string(), - "host_compiler_prefix": attr.string(), - "host_compiler_warnings": attr.string_list(), - "host_unfiltered_compile_flags": attr.string_list(), - "linker_bin_path": attr.string(), - "msvc_cl_path": attr.string(default = "msvc_not_used"), - "msvc_env_include": attr.string(default = "msvc_not_used"), - "msvc_env_lib": attr.string(default = "msvc_not_used"), - "msvc_env_path": attr.string(default = "msvc_not_used"), - "msvc_env_tmp": attr.string(default = "msvc_not_used"), - "msvc_lib_path": attr.string(default = "msvc_not_used"), - "msvc_link_path": attr.string(default = "msvc_not_used"), - "msvc_ml_path": attr.string(default = "msvc_not_used"), - }, - provides = [CcToolchainConfigInfo], - executable = True, -) diff --git a/third_party/toolchains/preconfig/centos6/gcc7-nvcc-cuda10.1/clang/bin/crosstool_wrapper_driver_is_not_gcc b/third_party/toolchains/preconfig/centos6/gcc7-nvcc-cuda10.1/clang/bin/crosstool_wrapper_driver_is_not_gcc deleted file mode 100755 index 72a1fd95a95..00000000000 --- a/third_party/toolchains/preconfig/centos6/gcc7-nvcc-cuda10.1/clang/bin/crosstool_wrapper_driver_is_not_gcc +++ /dev/null @@ -1,280 +0,0 @@ -#!/usr/bin/env python2 -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Crosstool wrapper for compiling CUDA programs. - -SYNOPSIS: - crosstool_wrapper_is_not_gcc [options passed in by cc_library() - or cc_binary() rule] - -DESCRIPTION: - This script is expected to be called by the cc_library() or cc_binary() bazel - rules. When the option "-x cuda" is present in the list of arguments passed - to this script, it invokes the nvcc CUDA compiler. Most arguments are passed - as is as a string to --compiler-options of nvcc. When "-x cuda" is not - present, this wrapper invokes hybrid_driver_is_not_gcc with the input - arguments as is. - -NOTES: - Changes to the contents of this file must be propagated from - //third_party/gpus/crosstool/crosstool_wrapper_is_not_gcc to - //third_party/gpus/crosstool/v*/*/clang/bin/crosstool_wrapper_is_not_gcc -""" - -from __future__ import print_function - -__author__ = 'keveman@google.com (Manjunath Kudlur)' - -from argparse import ArgumentParser -import os -import subprocess -import re -import sys -import pipes - -# Template values set by cuda_autoconf. 
-CPU_COMPILER = ('/opt/rh/devtoolset-7/root/usr/bin/gcc') -GCC_HOST_COMPILER_PATH = ('/opt/rh/devtoolset-7/root/usr/bin/gcc') - -NVCC_PATH = '/usr/local/cuda-10.1/bin/nvcc' -PREFIX_DIR = os.path.dirname(GCC_HOST_COMPILER_PATH) -NVCC_VERSION = '10.1' - - -def Log(s): - print('gpus/crosstool: {0}'.format(s)) - - -def GetOptionValue(argv, option): - """Extract the list of values for option from the argv list. - - Args: - argv: A list of strings, possibly the argv passed to main(). - option: The option whose value to extract, without the leading '-'. - - Returns: - A list of values, either directly following the option, - (eg., -opt val1 val2) or values collected from multiple occurrences of - the option (eg., -opt val1 -opt val2). - """ - - parser = ArgumentParser() - parser.add_argument('-' + option, nargs='*', action='append') - args, _ = parser.parse_known_args(argv) - if not args or not vars(args)[option]: - return [] - else: - return sum(vars(args)[option], []) - - -def GetHostCompilerOptions(argv): - """Collect the -isystem, -iquote, and --sysroot option values from argv. - - Args: - argv: A list of strings, possibly the argv passed to main(). - - Returns: - The string that can be used as the --compiler-options to nvcc. - """ - - parser = ArgumentParser() - parser.add_argument('-isystem', nargs='*', action='append') - parser.add_argument('-iquote', nargs='*', action='append') - parser.add_argument('--sysroot', nargs=1) - parser.add_argument('-g', nargs='*', action='append') - parser.add_argument('-fno-canonical-system-headers', action='store_true') - parser.add_argument('-no-canonical-prefixes', action='store_true') - - args, _ = parser.parse_known_args(argv) - - opts = '' - - if args.isystem: - opts += ' -isystem ' + ' -isystem '.join(sum(args.isystem, [])) - if args.iquote: - opts += ' -iquote ' + ' -iquote '.join(sum(args.iquote, [])) - if args.g: - opts += ' -g' + ' -g'.join(sum(args.g, [])) - if args.fno_canonical_system_headers: - opts += ' -fno-canonical-system-headers' - if args.no_canonical_prefixes: - opts += ' -no-canonical-prefixes' - if args.sysroot: - opts += ' --sysroot ' + args.sysroot[0] - - return opts - - -def _update_options(nvcc_options): - if NVCC_VERSION in ('7.0',): - return nvcc_options - - update_options = {'relaxed-constexpr': 'expt-relaxed-constexpr'} - return [ - update_options[opt] if opt in update_options else opt - for opt in nvcc_options - ] - - -def GetNvccOptions(argv): - """Collect the -nvcc_options values from argv. - - Args: - argv: A list of strings, possibly the argv passed to main(). - - Returns: - The string that can be passed directly to nvcc. - """ - - parser = ArgumentParser() - parser.add_argument('-nvcc_options', nargs='*', action='append') - - args, _ = parser.parse_known_args(argv) - - if args.nvcc_options: - options = _update_options(sum(args.nvcc_options, [])) - return ' '.join(['--' + a for a in options]) - return '' - - -def InvokeNvcc(argv, log=False): - """Call nvcc with arguments assembled from argv. - - Args: - argv: A list of strings, possibly the argv passed to main(). - log: True if logging is requested. 
-
-  Returns:
-    The return value of calling os.system('nvcc ' + args)
-  """
-
-  host_compiler_options = GetHostCompilerOptions(argv)
-  nvcc_compiler_options = GetNvccOptions(argv)
-  opt_option = GetOptionValue(argv, 'O')
-  m_options = GetOptionValue(argv, 'm')
-  m_options = ''.join([' -m' + m for m in m_options if m in ['32', '64']])
-  include_options = GetOptionValue(argv, 'I')
-  out_file = GetOptionValue(argv, 'o')
-  depfiles = GetOptionValue(argv, 'MF')
-  defines = GetOptionValue(argv, 'D')
-  defines = ''.join([' -D' + define for define in defines])
-  undefines = GetOptionValue(argv, 'U')
-  undefines = ''.join([' -U' + define for define in undefines])
-  std_options = GetOptionValue(argv, 'std')
-  # Currently, only c++11 is supported by the CUDA 7.0 std argument.
-  nvcc_allowed_std_options = ['c++11']
-  std_options = ''.join([
-      ' -std=' + define
-      for define in std_options
-      if define in nvcc_allowed_std_options
-  ])
-
-  # The list of source files gets passed after the -c option. I don't know of
-  # any other reliable way to just get the list of source files to be compiled.
-  src_files = GetOptionValue(argv, 'c')
-
-  # Pass -w through from host to nvcc, but don't do anything fancier with
-  # warnings-related flags, since they're not necessarily the same across
-  # compilers.
-  warning_options = ' -w' if '-w' in argv else ''
-
-  if len(src_files) == 0:
-    return 1
-  if len(out_file) != 1:
-    return 1
-
-  opt = (' -O2' if
-         (len(opt_option) > 0 and int(opt_option[0]) > 0) else ' -g -G')
-
-  includes = (' -I ' +
-              ' -I '.join(include_options) if len(include_options) > 0 else '')
-
-  # Unfortunately, there are other options that have a -c prefix too, so allow
-  # only the values that look like C/C++ source files.
-  src_files = [
-      f for f in src_files if re.search(r'\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)
-  ]
-  srcs = ' '.join(src_files)
-  out = ' -o ' + out_file[0]
-
-  supported_cuda_compute_capabilities = ['3.0', '6.0']
-  nvccopts = '-D_FORCE_INLINES '
-  for capability in supported_cuda_compute_capabilities:
-    capability = capability.replace('.', '')
-    nvccopts += r'-gencode=arch=compute_%s,\"code=sm_%s,compute_%s\" ' % (
-        capability, capability, capability)
-  nvccopts += ' ' + nvcc_compiler_options
-  nvccopts += undefines
-  nvccopts += defines
-  nvccopts += std_options
-  nvccopts += m_options
-  nvccopts += warning_options
-
-  if depfiles:
-    # Generate the dependency file.
-    depfile = depfiles[0]
-    cmd = (
-        NVCC_PATH + ' ' + nvccopts + ' --compiler-options "' +
-        host_compiler_options + '"' + ' --compiler-bindir=' +
-        GCC_HOST_COMPILER_PATH + ' -I .' + ' -x cu ' + opt + includes + ' ' +
-        srcs + ' -M -o ' + depfile)
-    if log:
-      Log(cmd)
-    exit_status = os.system(cmd)
-    if exit_status != 0:
-      return exit_status
-
-  cmd = (
-      NVCC_PATH + ' ' + nvccopts + ' --compiler-options "' +
-      host_compiler_options + ' -fPIC"' + ' --compiler-bindir=' +
-      GCC_HOST_COMPILER_PATH + ' -I .' + ' -x cu ' + opt + includes + ' -c ' +
-      srcs + out)
-
-  # TODO(zhengxq): for some reason, 'gcc' needs this help to find 'as'.
-  # Need to investigate and fix.
- cmd = 'PATH=' + PREFIX_DIR + ':$PATH ' + cmd - if log: - Log(cmd) - return os.system(cmd) - - -def main(): - parser = ArgumentParser() - parser.add_argument('-x', nargs=1) - parser.add_argument('--cuda_log', action='store_true') - args, leftover = parser.parse_known_args(sys.argv[1:]) - - if args.x and args.x[0] == 'cuda': - if args.cuda_log: - Log('-x cuda') - leftover = [pipes.quote(s) for s in leftover] - if args.cuda_log: - Log('using nvcc') - return InvokeNvcc(leftover, log=args.cuda_log) - - # Strip our flags before passing through to the CPU compiler for files which - # are not -x cuda. We can't just pass 'leftover' because it also strips -x. - # We not only want to pass -x to the CPU compiler, but also keep it in its - # relative location in the argv list (the compiler is actually sensitive to - # this). - cpu_compiler_flags = [ - flag for flag in sys.argv[1:] if not flag.startswith(('--cuda_log')) - ] - - return subprocess.call([CPU_COMPILER] + cpu_compiler_flags) - - -if __name__ == '__main__': - sys.exit(main()) diff --git a/third_party/toolchains/preconfig/centos6/gcc7-nvcc-cuda10.1/windows/msvc_wrapper_for_nvcc.py b/third_party/toolchains/preconfig/centos6/gcc7-nvcc-cuda10.1/windows/msvc_wrapper_for_nvcc.py deleted file mode 100755 index dfd63dd7968..00000000000 --- a/third_party/toolchains/preconfig/centos6/gcc7-nvcc-cuda10.1/windows/msvc_wrapper_for_nvcc.py +++ /dev/null @@ -1,207 +0,0 @@ -#!/usr/bin/env python -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Crosstool wrapper for compiling CUDA programs with nvcc on Windows. - -DESCRIPTION: - This script is the Windows version of - //third_party/gpus/crosstool/crosstool_wrapper_is_not_gcc -""" - -from __future__ import print_function - -from argparse import ArgumentParser -import os -import subprocess -import re -import sys -import pipes - -# Template values set by cuda_autoconf. -CPU_COMPILER = ('/opt/rh/devtoolset-7/root/usr/bin/gcc') -GCC_HOST_COMPILER_PATH = ('/opt/rh/devtoolset-7/root/usr/bin/gcc') - -NVCC_PATH = '/usr/local/cuda-10.1/bin/nvcc' -NVCC_VERSION = '10.1' -NVCC_TEMP_DIR = 'C:\\Windows\\Temp\\nvcc_inter_files_tmp_dir' -supported_cuda_compute_capabilities = ['3.0', '6.0'] - - -def Log(s): - print('gpus/crosstool: {0}'.format(s)) - - -def GetOptionValue(argv, option): - """Extract the list of values for option from options. - - Args: - option: The option whose value to extract, without the leading '/'. - - Returns: - 1. A list of values, either directly following the option, - (eg., /opt val1 val2) or values collected from multiple occurrences of - the option (eg., /opt val1 /opt val2). - 2. The leftover options. 
- """ - - parser = ArgumentParser(prefix_chars='/') - parser.add_argument('/' + option, nargs='*', action='append') - args, leftover = parser.parse_known_args(argv) - if args and vars(args)[option]: - return (sum(vars(args)[option], []), leftover) - return ([], leftover) - - -def _update_options(nvcc_options): - if NVCC_VERSION in ('7.0',): - return nvcc_options - - update_options = {'relaxed-constexpr': 'expt-relaxed-constexpr'} - return [ - update_options[opt] if opt in update_options else opt - for opt in nvcc_options - ] - - -def GetNvccOptions(argv): - """Collect the -nvcc_options values from argv. - - Args: - argv: A list of strings, possibly the argv passed to main(). - - Returns: - 1. The string that can be passed directly to nvcc. - 2. The leftover options. - """ - - parser = ArgumentParser() - parser.add_argument('-nvcc_options', nargs='*', action='append') - - args, leftover = parser.parse_known_args(argv) - - if args.nvcc_options: - options = _update_options(sum(args.nvcc_options, [])) - return (['--' + a for a in options], leftover) - return ([], leftover) - - -def InvokeNvcc(argv, log=False): - """Call nvcc with arguments assembled from argv. - - Args: - argv: A list of strings, possibly the argv passed to main(). - log: True if logging is requested. - - Returns: - The return value of calling os.system('nvcc ' + args) - """ - - src_files = [f for f in argv if re.search(r'\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)] - if len(src_files) == 0: - raise RuntimeError('No source files found for cuda compilation.') - - out_file = [f for f in argv if f.startswith('/Fo')] - if len(out_file) != 1: - raise RuntimeError( - 'Please specify exactly one output file for cuda compilation.') - out = ['-o', out_file[0][len('/Fo'):]] - - nvcc_compiler_options, argv = GetNvccOptions(argv) - - opt_option, argv = GetOptionValue(argv, 'O') - opt = ['-g', '-G'] - if (len(opt_option) > 0 and opt_option[0] != 'd'): - opt = ['-O2'] - - include_options, argv = GetOptionValue(argv, 'I') - includes = ['-I ' + include for include in include_options] - - defines, argv = GetOptionValue(argv, 'D') - defines = ['-D' + define for define in defines] - - undefines, argv = GetOptionValue(argv, 'U') - undefines = ['-U' + define for define in undefines] - - # The rest of the unrecognized options should be passed to host compiler - host_compiler_options = [ - option for option in argv if option not in (src_files + out_file) - ] - - m_options = ['-m64'] - - nvccopts = ['-D_FORCE_INLINES'] - for capability in supported_cuda_compute_capabilities: - capability = capability.replace('.', '') - nvccopts += [ - r'-gencode=arch=compute_%s,"code=sm_%s,compute_%s"' % - (capability, capability, capability) - ] - nvccopts += nvcc_compiler_options - nvccopts += undefines - nvccopts += defines - nvccopts += m_options - nvccopts += ['--compiler-options="' + ' '.join(host_compiler_options) + '"'] - nvccopts += ['-x', 'cu'] + opt + includes + out + ['-c'] + src_files - # If we don't specify --keep-dir, nvcc will generate intermediate files under TEMP - # Put them under NVCC_TEMP_DIR instead, then Bazel can ignore files under NVCC_TEMP_DIR during dependency check - # http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#options-for-guiding-compiler-driver - # Different actions are sharing NVCC_TEMP_DIR, so we cannot remove it if the directory already exists. 
- if os.path.isfile(NVCC_TEMP_DIR): - os.remove(NVCC_TEMP_DIR) - if not os.path.exists(NVCC_TEMP_DIR): - os.makedirs(NVCC_TEMP_DIR) - nvccopts += ['--keep', '--keep-dir', NVCC_TEMP_DIR] - cmd = [NVCC_PATH] + nvccopts - if log: - Log(cmd) - proc = subprocess.Popen( - cmd, - stdout=sys.stdout, - stderr=sys.stderr, - env=os.environ.copy(), - shell=True) - proc.wait() - return proc.returncode - - -def main(): - parser = ArgumentParser() - parser.add_argument('-x', nargs=1) - parser.add_argument('--cuda_log', action='store_true') - args, leftover = parser.parse_known_args(sys.argv[1:]) - - if args.x and args.x[0] == 'cuda': - if args.cuda_log: - Log('-x cuda') - leftover = [pipes.quote(s) for s in leftover] - if args.cuda_log: - Log('using nvcc') - return InvokeNvcc(leftover, log=args.cuda_log) - - # Strip our flags before passing through to the CPU compiler for files which - # are not -x cuda. We can't just pass 'leftover' because it also strips -x. - # We not only want to pass -x to the CPU compiler, but also keep it in its - # relative location in the argv list (the compiler is actually sensitive to - # this). - cpu_compiler_flags = [ - flag for flag in sys.argv[1:] if not flag.startswith(('--cuda_log')) and - not flag.startswith(('-nvcc_options')) - ] - - return subprocess.call([CPU_COMPILER] + cpu_compiler_flags) - - -if __name__ == '__main__': - sys.exit(main()) diff --git a/third_party/toolchains/preconfig/centos6/gcc7/BUILD b/third_party/toolchains/preconfig/centos6/gcc7/BUILD deleted file mode 100755 index 5d97f20a7ac..00000000000 --- a/third_party/toolchains/preconfig/centos6/gcc7/BUILD +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright 2016 The Bazel Authors. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This becomes the BUILD file for @local_config_cc// under non-FreeBSD unixes. - -package(default_visibility = ["//visibility:public"]) - -load(":cc_toolchain_config.bzl", "cc_toolchain_config") - -licenses(["notice"]) # Apache 2.0 - -cc_library( - name = "malloc", -) - -filegroup( - name = "empty", - srcs = [], -) - -filegroup( - name = "cc_wrapper", - srcs = ["cc_wrapper.sh"], -) - -filegroup( - name = "compiler_deps", - srcs = glob(["extra_tools/**"]) + [":empty"], -) - -# This is the entry point for --crosstool_top. Toolchains are found -# by lopping off the name of --crosstool_top and searching for -# the "${CPU}" entry in the toolchains attribute. 
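
As the comment above notes, Bazel resolves --crosstool_top against the `toolchains` dict of the cc_toolchain_suite below, preferring a "cpu|compiler" entry over a bare "cpu" entry. A rough Python analogue of that lookup rule (a sketch of the selection behavior, not Bazel's actual implementation):

```python
def select_cc_toolchain(toolchains, cpu, compiler=None):
    """Prefer the 'cpu|compiler' entry; fall back to the bare 'cpu' entry."""
    if compiler is not None:
        key = '%s|%s' % (cpu, compiler)
        if key in toolchains:
            return toolchains[key]
    if cpu in toolchains:
        return toolchains[cpu]
    raise KeyError('no toolchain entry for cpu=%r, compiler=%r' % (cpu, compiler))

# The suite below maps both spellings to the same cc_toolchain targets.
suite = {
    'k8|gcc': ':cc-compiler-k8',
    'k8': ':cc-compiler-k8',
    'armeabi-v7a|compiler': ':cc-compiler-armeabi-v7a',
    'armeabi-v7a': ':cc-compiler-armeabi-v7a',
}
assert select_cc_toolchain(suite, 'k8', 'gcc') == ':cc-compiler-k8'
assert select_cc_toolchain(suite, 'armeabi-v7a') == ':cc-compiler-armeabi-v7a'
```
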
-cc_toolchain_suite( - name = "toolchain", - toolchains = { - "k8|gcc": ":cc-compiler-k8", - "k8": ":cc-compiler-k8", - "armeabi-v7a|compiler": ":cc-compiler-armeabi-v7a", - "armeabi-v7a": ":cc-compiler-armeabi-v7a", - }, -) - -cc_toolchain( - name = "cc-compiler-k8", - all_files = ":compiler_deps", - ar_files = ":empty", - as_files = ":empty", - compiler_files = ":compiler_deps", - dwp_files = ":empty", - linker_files = ":compiler_deps", - objcopy_files = ":empty", - strip_files = ":empty", - supports_param_files = 1, - toolchain_config = ":linux_gnu_x86", - toolchain_identifier = "linux_gnu_x86", -) - -cc_toolchain_config( - name = "linux_gnu_x86", - compiler = "gcc", - cpu = "k8", -) - -toolchain( - name = "cc-toolchain-k8", - exec_compatible_with = [ - # TODO(katre): add autodiscovered constraints for host CPU and OS. - ], - target_compatible_with = [ - # TODO(katre): add autodiscovered constraints for host CPU and OS. - ], - toolchain = ":cc-compiler-k8", - toolchain_type = "@bazel_tools//tools/cpp:toolchain_type", -) - -# Android tooling requires a default toolchain for the armeabi-v7a cpu. -cc_toolchain( - name = "cc-compiler-armeabi-v7a", - all_files = ":empty", - ar_files = ":empty", - as_files = ":empty", - compiler_files = ":empty", - dwp_files = ":empty", - linker_files = ":empty", - objcopy_files = ":empty", - strip_files = ":empty", - supports_param_files = 1, - toolchain_config = ":stub_armeabi-v7a", - toolchain_identifier = "stub_armeabi-v7a", -) - -cc_toolchain_config( - name = "stub_armeabi-v7a", - compiler = "compiler", - cpu = "armeabi-v7a", -) - -toolchain( - name = "cc-toolchain-armeabi-v7a", - exec_compatible_with = [ - # TODO(katre): add autodiscovered constraints for host CPU and OS. - ], - target_compatible_with = [ - "@bazel_tools//platforms:arm", - "@bazel_tools//platforms:android", - ], - toolchain = ":cc-compiler-armeabi-v7a", - toolchain_type = "@bazel_tools//tools/cpp:toolchain_type", -) diff --git a/third_party/toolchains/preconfig/centos6/gcc7/WORKSPACE b/third_party/toolchains/preconfig/centos6/gcc7/WORKSPACE deleted file mode 100644 index bc05b4c36ff..00000000000 --- a/third_party/toolchains/preconfig/centos6/gcc7/WORKSPACE +++ /dev/null @@ -1,2 +0,0 @@ -# DO NOT EDIT: automatically generated WORKSPACE file for cc_autoconf rule -workspace(name = "local_config_cc") diff --git a/third_party/toolchains/preconfig/centos6/gcc7/cc_toolchain_config.bzl b/third_party/toolchains/preconfig/centos6/gcc7/cc_toolchain_config.bzl deleted file mode 100755 index 182957487ae..00000000000 --- a/third_party/toolchains/preconfig/centos6/gcc7/cc_toolchain_config.bzl +++ /dev/null @@ -1,1734 +0,0 @@ -# Copyright 2019 The Bazel Authors. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
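
The 1,734-line Starlark file whose deletion follows builds a CcToolchainConfigInfo for four configurations, and its `_impl` function dispatches on the rule's (cpu, compiler) attribute pair. Condensed to plain Python, that dispatch looks like this (toolchain identifiers are taken from the deleted file; no actual config is constructed):

```python
def pick_toolchain_impl(cpu, compiler=None):
    """Condensed view of _impl's dispatch in the deleted cc_toolchain_config.bzl."""
    if cpu == 'armeabi-v7a':
        return 'stub_armeabi-v7a'          # /bin/false stub toolchain
    if cpu == 'x64_windows' and compiler == 'msvc-cl':
        return 'msvc_x64'                  # MSVC-flavored config
    if cpu == 'x64_windows' and compiler == 'mingw-gcc':
        return 'msys_x64_mingw'            # MinGW config
    return 'linux_gnu_x86'                 # devtoolset-7 GCC default

assert pick_toolchain_impl('k8', 'gcc') == 'linux_gnu_x86'
assert pick_toolchain_impl('x64_windows', 'msvc-cl') == 'msvc_x64'
```
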
- -"""A Starlark cc_toolchain configuration rule""" - -load( - "@bazel_tools//tools/cpp:cc_toolchain_config_lib.bzl", - "action_config", - "artifact_name_pattern", - "env_entry", - "env_set", - "feature", - "feature_set", - "flag_group", - "flag_set", - "make_variable", # @unused - "tool", - "tool_path", - "variable_with_value", - "with_feature_set", -) -load("@bazel_tools//tools/build_defs/cc:action_names.bzl", "ACTION_NAMES") - -all_compile_actions = [ - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.linkstamp_compile, - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.clif_match, - ACTION_NAMES.lto_backend, -] - -all_cpp_compile_actions = [ - ACTION_NAMES.cpp_compile, - ACTION_NAMES.linkstamp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.clif_match, -] - -preprocessor_compile_actions = [ - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.linkstamp_compile, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.clif_match, -] - -codegen_compile_actions = [ - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.linkstamp_compile, - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.lto_backend, -] - -all_link_actions = [ - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, -] - -def _windows_msvc_impl(ctx): - toolchain_identifier = "msvc_x64" - host_system_name = "local" - target_system_name = "local" - target_cpu = "x64_windows" - target_libc = "msvcrt" - compiler = "msvc-cl" - abi_version = "local" - abi_libc_version = "local" - cc_target_os = None - builtin_sysroot = None - - cxx_builtin_include_directories = [ - "/opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/include", - "/usr/local/include", - "/opt/rh/devtoolset-7/root/usr/include", - "/usr/include", - "/opt/rh/devtoolset-7/root/usr/include/c++/7", - "/opt/rh/devtoolset-7/root/usr/include/c++/7/x86_64-redhat-linux", - "/opt/rh/devtoolset-7/root/usr/include/c++/7/backward", - ] - - cpp_link_nodeps_dynamic_library_action = action_config( - action_name = ACTION_NAMES.cpp_link_nodeps_dynamic_library, - implies = [ - "nologo", - "shared_flag", - "linkstamps", - "output_execpath_flags", - "input_param_flags", - "user_link_flags", - "default_link_flags", - "linker_subsystem_flag", - "linker_param_file", - "msvc_env", - "no_stripping", - "has_configured_linker_path", - "def_file", - ], - tools = [tool(path = "")], - ) - - cpp_link_static_library_action = action_config( - action_name = ACTION_NAMES.cpp_link_static_library, - implies = [ - "nologo", - "archiver_flags", - "input_param_flags", - "linker_param_file", - "msvc_env", - ], - tools = [tool(path = "")], - ) - - assemble_action = action_config( - action_name = ACTION_NAMES.assemble, - implies = [ - "compiler_input_flags", - "compiler_output_flags", - "nologo", - "msvc_env", - "sysroot", - ], - tools = [tool(path = "")], - ) - - preprocess_assemble_action = action_config( - action_name = ACTION_NAMES.preprocess_assemble, - implies = [ - "compiler_input_flags", - "compiler_output_flags", - "nologo", - "msvc_env", - "sysroot", - ], - tools = [tool(path = "")], - ) - - c_compile_action = action_config( - action_name = 
ACTION_NAMES.c_compile, - implies = [ - "compiler_input_flags", - "compiler_output_flags", - "default_compile_flags", - "nologo", - "msvc_env", - "parse_showincludes", - "user_compile_flags", - "sysroot", - "unfiltered_compile_flags", - ], - tools = [tool(path = "")], - ) - - cpp_compile_action = action_config( - action_name = ACTION_NAMES.cpp_compile, - implies = [ - "compiler_input_flags", - "compiler_output_flags", - "default_compile_flags", - "nologo", - "msvc_env", - "parse_showincludes", - "user_compile_flags", - "sysroot", - "unfiltered_compile_flags", - ], - tools = [tool(path = "")], - ) - - cpp_link_executable_action = action_config( - action_name = ACTION_NAMES.cpp_link_executable, - implies = [ - "nologo", - "linkstamps", - "output_execpath_flags", - "input_param_flags", - "user_link_flags", - "default_link_flags", - "linker_subsystem_flag", - "linker_param_file", - "msvc_env", - "no_stripping", - ], - tools = [tool(path = "")], - ) - - cpp_link_dynamic_library_action = action_config( - action_name = ACTION_NAMES.cpp_link_dynamic_library, - implies = [ - "nologo", - "shared_flag", - "linkstamps", - "output_execpath_flags", - "input_param_flags", - "user_link_flags", - "default_link_flags", - "linker_subsystem_flag", - "linker_param_file", - "msvc_env", - "no_stripping", - "has_configured_linker_path", - "def_file", - ], - tools = [tool(path = "")], - ) - - action_configs = [ - assemble_action, - preprocess_assemble_action, - c_compile_action, - cpp_compile_action, - cpp_link_executable_action, - cpp_link_dynamic_library_action, - cpp_link_nodeps_dynamic_library_action, - cpp_link_static_library_action, - ] - - msvc_link_env_feature = feature( - name = "msvc_link_env", - env_sets = [ - env_set( - actions = all_link_actions + - [ACTION_NAMES.cpp_link_static_library], - env_entries = [env_entry(key = "LIB", value = "")], - ), - ], - ) - - shared_flag_feature = feature( - name = "shared_flag", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ], - flag_groups = [flag_group(flags = ["/DLL"])], - ), - ], - ) - - determinism_feature = feature( - name = "determinism", - enabled = True, - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group( - flags = [ - "/wd4117", - "-D__DATE__=\"redacted\"", - "-D__TIMESTAMP__=\"redacted\"", - "-D__TIME__=\"redacted\"", - ], - ), - ], - ), - ], - ) - - sysroot_feature = feature( - name = "sysroot", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ], - flag_groups = [ - flag_group( - flags = ["--sysroot=%{sysroot}"], - iterate_over = "sysroot", - expand_if_available = "sysroot", - ), - ], - ), - ], - ) - - unfiltered_compile_flags_feature = feature( - name = "unfiltered_compile_flags", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ], - flag_groups = [ - flag_group( - flags = ["%{unfiltered_compile_flags}"], - iterate_over = "unfiltered_compile_flags", - expand_if_available = 
"unfiltered_compile_flags", - ), - ], - ), - ], - ) - - copy_dynamic_libraries_to_binary_feature = feature(name = "copy_dynamic_libraries_to_binary") - - input_param_flags_feature = feature( - name = "input_param_flags", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ], - flag_groups = [ - flag_group( - flags = ["/IMPLIB:%{interface_library_output_path}"], - expand_if_available = "interface_library_output_path", - ), - ], - ), - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group( - flags = ["%{libopts}"], - iterate_over = "libopts", - expand_if_available = "libopts", - ), - ], - ), - flag_set( - actions = all_link_actions + - [ACTION_NAMES.cpp_link_static_library], - flag_groups = [ - flag_group( - iterate_over = "libraries_to_link", - flag_groups = [ - flag_group( - iterate_over = "libraries_to_link.object_files", - flag_groups = [flag_group(flags = ["%{libraries_to_link.object_files}"])], - expand_if_equal = variable_with_value( - name = "libraries_to_link.type", - value = "object_file_group", - ), - ), - flag_group( - flag_groups = [flag_group(flags = ["%{libraries_to_link.name}"])], - expand_if_equal = variable_with_value( - name = "libraries_to_link.type", - value = "object_file", - ), - ), - flag_group( - flag_groups = [flag_group(flags = ["%{libraries_to_link.name}"])], - expand_if_equal = variable_with_value( - name = "libraries_to_link.type", - value = "interface_library", - ), - ), - flag_group( - flag_groups = [ - flag_group( - flags = ["%{libraries_to_link.name}"], - expand_if_false = "libraries_to_link.is_whole_archive", - ), - flag_group( - flags = ["/WHOLEARCHIVE:%{libraries_to_link.name}"], - expand_if_true = "libraries_to_link.is_whole_archive", - ), - ], - expand_if_equal = variable_with_value( - name = "libraries_to_link.type", - value = "static_library", - ), - ), - ], - expand_if_available = "libraries_to_link", - ), - ], - ), - ], - ) - - fastbuild_feature = feature( - name = "fastbuild", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/Od", "/Z7"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group( - flags = ["", "/INCREMENTAL:NO"], - ), - ], - ), - ], - implies = ["generate_pdb_file"], - ) - - user_compile_flags_feature = feature( - name = "user_compile_flags", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ], - flag_groups = [ - flag_group( - flags = ["%{user_compile_flags}"], - iterate_over = "user_compile_flags", - expand_if_available = "user_compile_flags", - ), - ], - ), - ], - ) - - archiver_flags_feature = feature( - name = "archiver_flags", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.cpp_link_static_library], - flag_groups = [ - flag_group( - flags = ["/OUT:%{output_execpath}"], - expand_if_available = "output_execpath", - ), - ], - ), - ], - ) - - default_link_flags_feature = feature( - name = "default_link_flags", - enabled = True, - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/MACHINE:X64"])], - ), - ], - ) - - static_link_msvcrt_feature = feature(name = "static_link_msvcrt") - - dynamic_link_msvcrt_debug_feature = feature( - name = "dynamic_link_msvcrt_debug", - flag_sets = [ - flag_set( - 
actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/MDd"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/DEFAULTLIB:msvcrtd.lib"])], - ), - ], - requires = [feature_set(features = ["dbg"])], - ) - - dbg_feature = feature( - name = "dbg", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/Od", "/Z7"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group( - flags = ["", "/INCREMENTAL:NO"], - ), - ], - ), - ], - implies = ["generate_pdb_file"], - ) - - opt_feature = feature( - name = "opt", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/O2"])], - ), - ], - implies = ["frame_pointer"], - ) - - supports_interface_shared_libraries_feature = feature( - name = "supports_interface_shared_libraries", - enabled = True, - ) - - user_link_flags_feature = feature( - name = "user_link_flags", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group( - flags = ["%{user_link_flags}"], - iterate_over = "user_link_flags", - expand_if_available = "user_link_flags", - ), - ], - ), - ], - ) - - default_compile_flags_feature = feature( - name = "default_compile_flags", - enabled = True, - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.linkstamp_compile, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.lto_backend, - ACTION_NAMES.clif_match, - ], - flag_groups = [ - flag_group( - flags = [ - "/DCOMPILER_MSVC", - "/DNOMINMAX", - "/D_WIN32_WINNT=0x0601", - "/D_CRT_SECURE_NO_DEPRECATE", - "/D_CRT_SECURE_NO_WARNINGS", - "/bigobj", - "/Zm500", - "/EHsc", - "/wd4351", - "/wd4291", - "/wd4250", - "/wd4996", - ], - ), - ], - ), - ], - ) - - msvc_compile_env_feature = feature( - name = "msvc_compile_env", - env_sets = [ - env_set( - actions = [ - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ], - env_entries = [env_entry(key = "INCLUDE", value = "")], - ), - ], - ) - - preprocessor_defines_feature = feature( - name = "preprocessor_defines", - enabled = True, - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ], - flag_groups = [ - flag_group( - flags = ["/D%{preprocessor_defines}"], - iterate_over = "preprocessor_defines", - ), - ], - ), - ], - ) - - generate_pdb_file_feature = feature( - name = "generate_pdb_file", - requires = [ - feature_set(features = ["dbg"]), - feature_set(features = ["fastbuild"]), - ], - ) - - output_execpath_flags_feature = feature( - name = "output_execpath_flags", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group( - flags = ["/OUT:%{output_execpath}"], - expand_if_available = "output_execpath", - ), - ], - ), - ], - ) - - dynamic_link_msvcrt_no_debug_feature = feature( - name = "dynamic_link_msvcrt_no_debug", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups 
= [flag_group(flags = ["/MD"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/DEFAULTLIB:msvcrt.lib"])], - ), - ], - requires = [ - feature_set(features = ["fastbuild"]), - feature_set(features = ["opt"]), - ], - ) - - disable_assertions_feature = feature( - name = "disable_assertions", - enabled = True, - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/DNDEBUG"])], - with_features = [with_feature_set(features = ["opt"])], - ), - ], - ) - - has_configured_linker_path_feature = feature(name = "has_configured_linker_path") - - supports_dynamic_linker_feature = feature(name = "supports_dynamic_linker", enabled = True) - - no_stripping_feature = feature(name = "no_stripping") - - linker_param_file_feature = feature( - name = "linker_param_file", - flag_sets = [ - flag_set( - actions = all_link_actions + - [ACTION_NAMES.cpp_link_static_library], - flag_groups = [ - flag_group( - flags = ["@%{linker_param_file}"], - expand_if_available = "linker_param_file", - ), - ], - ), - ], - ) - - ignore_noisy_warnings_feature = feature( - name = "ignore_noisy_warnings", - enabled = True, - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.cpp_link_static_library], - flag_groups = [flag_group(flags = ["/ignore:4221"])], - ), - ], - ) - - no_legacy_features_feature = feature(name = "no_legacy_features") - - parse_showincludes_feature = feature( - name = "parse_showincludes", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_header_parsing, - ], - flag_groups = [flag_group(flags = ["/showIncludes"])], - ), - ], - ) - - static_link_msvcrt_no_debug_feature = feature( - name = "static_link_msvcrt_no_debug", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/MT"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/DEFAULTLIB:libcmt.lib"])], - ), - ], - requires = [ - feature_set(features = ["fastbuild"]), - feature_set(features = ["opt"]), - ], - ) - - treat_warnings_as_errors_feature = feature( - name = "treat_warnings_as_errors", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/WX"])], - ), - ], - ) - - windows_export_all_symbols_feature = feature(name = "windows_export_all_symbols") - - no_windows_export_all_symbols_feature = feature(name = "no_windows_export_all_symbols") - - include_paths_feature = feature( - name = "include_paths", - enabled = True, - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ], - flag_groups = [ - flag_group( - flags = ["/I%{quote_include_paths}"], - iterate_over = "quote_include_paths", - ), - flag_group( - flags = ["/I%{include_paths}"], - iterate_over = "include_paths", - ), - flag_group( - flags = ["/I%{system_include_paths}"], - iterate_over = "system_include_paths", - ), - ], - ), - ], - ) - - linkstamps_feature = feature( - name = "linkstamps", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group( - flags = ["%{linkstamp_paths}"], - iterate_over = "linkstamp_paths", - expand_if_available = "linkstamp_paths", - ), - 
], - ), - ], - ) - - targets_windows_feature = feature( - name = "targets_windows", - enabled = True, - implies = ["copy_dynamic_libraries_to_binary"], - ) - - linker_subsystem_flag_feature = feature( - name = "linker_subsystem_flag", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/SUBSYSTEM:CONSOLE"])], - ), - ], - ) - - static_link_msvcrt_debug_feature = feature( - name = "static_link_msvcrt_debug", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/MTd"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/DEFAULTLIB:libcmtd.lib"])], - ), - ], - requires = [feature_set(features = ["dbg"])], - ) - - frame_pointer_feature = feature( - name = "frame_pointer", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/Oy-"])], - ), - ], - ) - - compiler_output_flags_feature = feature( - name = "compiler_output_flags", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.assemble], - flag_groups = [ - flag_group( - flag_groups = [ - flag_group( - flags = ["/Fo%{output_file}", "/Zi"], - expand_if_available = "output_file", - expand_if_not_available = "output_assembly_file", - ), - ], - expand_if_not_available = "output_preprocess_file", - ), - ], - ), - flag_set( - actions = [ - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ], - flag_groups = [ - flag_group( - flag_groups = [ - flag_group( - flags = ["/Fo%{output_file}"], - expand_if_not_available = "output_preprocess_file", - ), - ], - expand_if_available = "output_file", - expand_if_not_available = "output_assembly_file", - ), - flag_group( - flag_groups = [ - flag_group( - flags = ["/Fa%{output_file}"], - expand_if_available = "output_assembly_file", - ), - ], - expand_if_available = "output_file", - ), - flag_group( - flag_groups = [ - flag_group( - flags = ["/P", "/Fi%{output_file}"], - expand_if_available = "output_preprocess_file", - ), - ], - expand_if_available = "output_file", - ), - ], - ), - ], - ) - - nologo_feature = feature( - name = "nologo", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ACTION_NAMES.cpp_link_static_library, - ], - flag_groups = [flag_group(flags = ["/nologo"])], - ), - ], - ) - - smaller_binary_feature = feature( - name = "smaller_binary", - enabled = True, - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/Gy", "/Gw"])], - with_features = [with_feature_set(features = ["opt"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/OPT:ICF", "/OPT:REF"])], - with_features = [with_feature_set(features = ["opt"])], - ), - ], - ) - - compiler_input_flags_feature = feature( - name = "compiler_input_flags", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - 
ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ], - flag_groups = [ - flag_group( - flags = ["/c", "%{source_file}"], - expand_if_available = "source_file", - ), - ], - ), - ], - ) - - def_file_feature = feature( - name = "def_file", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group( - flags = ["/DEF:%{def_file_path}", "/ignore:4070"], - expand_if_available = "def_file_path", - ), - ], - ), - ], - ) - - msvc_env_feature = feature( - name = "msvc_env", - env_sets = [ - env_set( - actions = [ - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ACTION_NAMES.cpp_link_static_library, - ], - env_entries = [ - env_entry(key = "PATH", value = ""), - env_entry(key = "TMP", value = ""), - env_entry(key = "TEMP", value = ""), - ], - ), - ], - implies = ["msvc_compile_env", "msvc_link_env"], - ) - - features = [ - no_legacy_features_feature, - nologo_feature, - has_configured_linker_path_feature, - no_stripping_feature, - targets_windows_feature, - copy_dynamic_libraries_to_binary_feature, - default_compile_flags_feature, - msvc_env_feature, - msvc_compile_env_feature, - msvc_link_env_feature, - include_paths_feature, - preprocessor_defines_feature, - parse_showincludes_feature, - generate_pdb_file_feature, - shared_flag_feature, - linkstamps_feature, - output_execpath_flags_feature, - archiver_flags_feature, - input_param_flags_feature, - linker_subsystem_flag_feature, - user_link_flags_feature, - default_link_flags_feature, - linker_param_file_feature, - static_link_msvcrt_feature, - static_link_msvcrt_no_debug_feature, - dynamic_link_msvcrt_no_debug_feature, - static_link_msvcrt_debug_feature, - dynamic_link_msvcrt_debug_feature, - dbg_feature, - fastbuild_feature, - opt_feature, - frame_pointer_feature, - disable_assertions_feature, - determinism_feature, - treat_warnings_as_errors_feature, - smaller_binary_feature, - ignore_noisy_warnings_feature, - user_compile_flags_feature, - sysroot_feature, - unfiltered_compile_flags_feature, - compiler_output_flags_feature, - compiler_input_flags_feature, - def_file_feature, - windows_export_all_symbols_feature, - no_windows_export_all_symbols_feature, - supports_dynamic_linker_feature, - supports_interface_shared_libraries_feature, - ] - - artifact_name_patterns = [ - artifact_name_pattern( - category_name = "object_file", - prefix = "", - extension = ".obj", - ), - artifact_name_pattern( - category_name = "static_library", - prefix = "", - extension = ".lib", - ), - artifact_name_pattern( - category_name = "alwayslink_static_library", - prefix = "", - extension = ".lo.lib", - ), - artifact_name_pattern( - category_name = "executable", - prefix = "", - extension = ".exe", - ), - artifact_name_pattern( - category_name = "dynamic_library", - prefix = "", - extension = ".dll", - ), - artifact_name_pattern( - category_name = "interface_library", - prefix = "", - extension = ".if.lib", - ), - ] - - make_variables = [] - - tool_paths = [ - tool_path(name = "ar", path = ""), - tool_path(name = "ml", path = ""), - tool_path(name = "cpp", path = ""), - tool_path(name = "gcc", path = ""), - tool_path(name = "gcov", path = "wrapper/bin/msvc_nop.bat"), - tool_path(name = "ld", path = 
""), - tool_path(name = "nm", path = "wrapper/bin/msvc_nop.bat"), - tool_path( - name = "objcopy", - path = "wrapper/bin/msvc_nop.bat", - ), - tool_path( - name = "objdump", - path = "wrapper/bin/msvc_nop.bat", - ), - tool_path( - name = "strip", - path = "wrapper/bin/msvc_nop.bat", - ), - ] - - return cc_common.create_cc_toolchain_config_info( - ctx = ctx, - features = features, - action_configs = action_configs, - artifact_name_patterns = artifact_name_patterns, - cxx_builtin_include_directories = cxx_builtin_include_directories, - toolchain_identifier = toolchain_identifier, - host_system_name = host_system_name, - target_system_name = target_system_name, - target_cpu = target_cpu, - target_libc = target_libc, - compiler = compiler, - abi_version = abi_version, - abi_libc_version = abi_libc_version, - tool_paths = tool_paths, - make_variables = make_variables, - builtin_sysroot = builtin_sysroot, - cc_target_os = None, - ) - -def _windows_msys_mingw_impl(ctx): - toolchain_identifier = "msys_x64_mingw" - host_system_name = "local" - target_system_name = "local" - target_cpu = "x64_windows" - target_libc = "mingw" - compiler = "mingw-gcc" - abi_version = "local" - abi_libc_version = "local" - cc_target_os = None - builtin_sysroot = None - action_configs = [] - - targets_windows_feature = feature( - name = "targets_windows", - implies = ["copy_dynamic_libraries_to_binary"], - enabled = True, - ) - - copy_dynamic_libraries_to_binary_feature = feature(name = "copy_dynamic_libraries_to_binary") - - gcc_env_feature = feature( - name = "gcc_env", - enabled = True, - env_sets = [ - env_set( - actions = [ - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ACTION_NAMES.cpp_link_static_library, - ], - env_entries = [ - env_entry(key = "PATH", value = "NOT_USED"), - ], - ), - ], - ) - - msys_mingw_flags = [ - ] - msys_mingw_link_flags = [ - ] - - default_compile_flags_feature = feature( - name = "default_compile_flags", - enabled = True, - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.linkstamp_compile, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.lto_backend, - ACTION_NAMES.clif_match, - ], - ), - flag_set( - actions = [ - ACTION_NAMES.linkstamp_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.lto_backend, - ACTION_NAMES.clif_match, - ], - flag_groups = ([flag_group(flags = msys_mingw_flags)] if msys_mingw_flags else []), - ), - ], - ) - - default_link_flags_feature = feature( - name = "default_link_flags", - enabled = True, - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = ([flag_group(flags = msys_mingw_link_flags)] if msys_mingw_link_flags else []), - ), - ], - ) - - supports_dynamic_linker_feature = feature(name = "supports_dynamic_linker", enabled = True) - - features = [ - targets_windows_feature, - copy_dynamic_libraries_to_binary_feature, - gcc_env_feature, - default_compile_flags_feature, - default_link_flags_feature, - supports_dynamic_linker_feature, - ] - - 
cxx_builtin_include_directories = [ - ] - - artifact_name_patterns = [ - artifact_name_pattern( - category_name = "executable", - prefix = "", - extension = ".exe", - ), - ] - - make_variables = [] - tool_paths = [ - ] - - return cc_common.create_cc_toolchain_config_info( - ctx = ctx, - features = features, - action_configs = action_configs, - artifact_name_patterns = artifact_name_patterns, - cxx_builtin_include_directories = cxx_builtin_include_directories, - toolchain_identifier = toolchain_identifier, - host_system_name = host_system_name, - target_system_name = target_system_name, - target_cpu = target_cpu, - target_libc = target_libc, - compiler = compiler, - abi_version = abi_version, - abi_libc_version = abi_libc_version, - tool_paths = tool_paths, - make_variables = make_variables, - builtin_sysroot = builtin_sysroot, - cc_target_os = cc_target_os, - ) - -def _armeabi_impl(ctx): - toolchain_identifier = "stub_armeabi-v7a" - host_system_name = "armeabi-v7a" - target_system_name = "armeabi-v7a" - target_cpu = "armeabi-v7a" - target_libc = "armeabi-v7a" - compiler = "compiler" - abi_version = "armeabi-v7a" - abi_libc_version = "armeabi-v7a" - cc_target_os = None - builtin_sysroot = None - action_configs = [] - - supports_pic_feature = feature(name = "supports_pic", enabled = True) - supports_dynamic_linker_feature = feature(name = "supports_dynamic_linker", enabled = True) - features = [supports_dynamic_linker_feature, supports_pic_feature] - - cxx_builtin_include_directories = [] - artifact_name_patterns = [] - make_variables = [] - - tool_paths = [ - tool_path(name = "ar", path = "/bin/false"), - tool_path(name = "compat-ld", path = "/bin/false"), - tool_path(name = "cpp", path = "/bin/false"), - tool_path(name = "dwp", path = "/bin/false"), - tool_path(name = "gcc", path = "/bin/false"), - tool_path(name = "gcov", path = "/bin/false"), - tool_path(name = "ld", path = "/bin/false"), - tool_path(name = "nm", path = "/bin/false"), - tool_path(name = "objcopy", path = "/bin/false"), - tool_path(name = "objdump", path = "/bin/false"), - tool_path(name = "strip", path = "/bin/false"), - ] - - return cc_common.create_cc_toolchain_config_info( - ctx = ctx, - features = features, - action_configs = action_configs, - artifact_name_patterns = artifact_name_patterns, - cxx_builtin_include_directories = cxx_builtin_include_directories, - toolchain_identifier = toolchain_identifier, - host_system_name = host_system_name, - target_system_name = target_system_name, - target_cpu = target_cpu, - target_libc = target_libc, - compiler = compiler, - abi_version = abi_version, - abi_libc_version = abi_libc_version, - tool_paths = tool_paths, - make_variables = make_variables, - builtin_sysroot = builtin_sysroot, - cc_target_os = cc_target_os, - ) - -def _impl(ctx): - if ctx.attr.cpu == "armeabi-v7a": - return _armeabi_impl(ctx) - elif ctx.attr.cpu == "x64_windows" and ctx.attr.compiler == "msvc-cl": - return _windows_msvc_impl(ctx) - elif ctx.attr.cpu == "x64_windows" and ctx.attr.compiler == "mingw-gcc": - return _windows_msys_mingw_impl(ctx) - - tool_paths = [ - tool_path(name = "ar", path = "/opt/rh/devtoolset-7/root/usr/bin/ar"), - tool_path(name = "ld", path = "/opt/rh/devtoolset-7/root/usr/bin/ld"), - tool_path(name = "cpp", path = "/opt/rh/devtoolset-7/root/usr/bin/cpp"), - tool_path(name = "gcc", path = "/opt/rh/devtoolset-7/root/usr/bin/gcc"), - tool_path(name = "dwp", path = "/opt/rh/devtoolset-7/root/usr/bin/dwp"), - tool_path(name = "gcov", path = "/opt/rh/devtoolset-7/root/usr/bin/gcov"), 
- tool_path(name = "nm", path = "/opt/rh/devtoolset-7/root/usr/bin/nm"), - tool_path(name = "objcopy", path = "/opt/rh/devtoolset-7/root/usr/bin/objcopy"), - tool_path(name = "objdump", path = "/opt/rh/devtoolset-7/root/usr/bin/objdump"), - tool_path(name = "strip", path = "/opt/rh/devtoolset-7/root/usr/bin/strip"), - ] - - cxx_builtin_include_directories = [ - "/opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/include", - "/usr/local/include", - "/opt/rh/devtoolset-7/root/usr/include", - "/usr/include", - "/opt/rh/devtoolset-7/root/usr/include/c++/7", - "/opt/rh/devtoolset-7/root/usr/include/c++/7/x86_64-redhat-linux", - "/opt/rh/devtoolset-7/root/usr/include/c++/7/backward", - ] - - action_configs = [] - - compile_flags = [ - "-U_FORTIFY_SOURCE", - "-fstack-protector", - "-Wall", - "-Wunused-but-set-parameter", - "-Wno-free-nonheap-object", - "-fno-omit-frame-pointer", - ] - - dbg_compile_flags = [ - "-g", - ] - - opt_compile_flags = [ - "-g0", - "-O2", - "-D_FORTIFY_SOURCE=1", - "-DNDEBUG", - "-ffunction-sections", - "-fdata-sections", - ] - - cxx_flags = [ - "-std=c++0x", - ] - - link_flags = [ - "-fuse-ld=gold", - "-Wl,-no-as-needed", - "-Wl,-z,relro,-z,now", - "-B/opt/rh/devtoolset-7/root/usr/bin", - "-pass-exit-codes", - "-lstdc++", - "-lm", - ] - - opt_link_flags = [ - "-Wl,--gc-sections", - ] - - unfiltered_compile_flags = [ - "-fno-canonical-system-headers", - "-Wno-builtin-macro-redefined", - "-D__DATE__=\"redacted\"", - "-D__TIMESTAMP__=\"redacted\"", - "-D__TIME__=\"redacted\"", - ] - - targets_windows_feature = feature( - name = "targets_windows", - implies = ["copy_dynamic_libraries_to_binary"], - enabled = True, - ) - - copy_dynamic_libraries_to_binary_feature = feature(name = "copy_dynamic_libraries_to_binary") - - gcc_env_feature = feature( - name = "gcc_env", - enabled = True, - env_sets = [ - env_set( - actions = [ - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ACTION_NAMES.cpp_link_static_library, - ], - env_entries = [ - env_entry(key = "PATH", value = "NOT_USED"), - ], - ), - ], - ) - - windows_features = [ - targets_windows_feature, - copy_dynamic_libraries_to_binary_feature, - gcc_env_feature, - ] - - coverage_feature = feature( - name = "coverage", - provides = ["profile"], - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ], - flag_groups = [ - flag_group(flags = ["--coverage"]), - ], - ), - flag_set( - actions = [ - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ACTION_NAMES.cpp_link_executable, - ], - flag_groups = [ - flag_group(flags = ["--coverage"]), - ], - ), - ], - ) - - supports_pic_feature = feature( - name = "supports_pic", - enabled = True, - ) - supports_start_end_lib_feature = feature( - name = "supports_start_end_lib", - enabled = True, - ) - - default_compile_flags_feature = feature( - name = "default_compile_flags", - enabled = True, - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.linkstamp_compile, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - 
ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.lto_backend, - ACTION_NAMES.clif_match, - ], - flag_groups = ([flag_group(flags = compile_flags)] if compile_flags else []), - ), - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.linkstamp_compile, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.lto_backend, - ACTION_NAMES.clif_match, - ], - flag_groups = ([flag_group(flags = dbg_compile_flags)] if dbg_compile_flags else []), - with_features = [with_feature_set(features = ["dbg"])], - ), - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.linkstamp_compile, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.lto_backend, - ACTION_NAMES.clif_match, - ], - flag_groups = ([flag_group(flags = opt_compile_flags)] if opt_compile_flags else []), - with_features = [with_feature_set(features = ["opt"])], - ), - flag_set( - actions = [ - ACTION_NAMES.linkstamp_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.lto_backend, - ACTION_NAMES.clif_match, - ], - flag_groups = ([flag_group(flags = cxx_flags)] if cxx_flags else []), - ), - ], - ) - - default_link_flags_feature = feature( - name = "default_link_flags", - enabled = True, - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = ([flag_group(flags = link_flags)] if link_flags else []), - ), - flag_set( - actions = all_link_actions, - flag_groups = ([flag_group(flags = opt_link_flags)] if opt_link_flags else []), - with_features = [with_feature_set(features = ["opt"])], - ), - ], - ) - - dbg_feature = feature(name = "dbg") - - opt_feature = feature(name = "opt") - - sysroot_feature = feature( - name = "sysroot", - enabled = True, - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.linkstamp_compile, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.lto_backend, - ACTION_NAMES.clif_match, - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ], - flag_groups = [ - flag_group( - flags = ["--sysroot=%{sysroot}"], - expand_if_available = "sysroot", - ), - ], - ), - ], - ) - - fdo_optimize_feature = feature( - name = "fdo_optimize", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group( - flags = [ - "-fprofile-use=%{fdo_profile_path}", - "-fprofile-correction", - ], - expand_if_available = "fdo_profile_path", - ), - ], - ), - ], - provides = ["profile"], - ) - - supports_dynamic_linker_feature = feature(name = "supports_dynamic_linker", enabled = True) - - user_compile_flags_feature = feature( - name = "user_compile_flags", - enabled = True, - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.linkstamp_compile, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - 
ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.lto_backend, - ACTION_NAMES.clif_match, - ], - flag_groups = [ - flag_group( - flags = ["%{user_compile_flags}"], - iterate_over = "user_compile_flags", - expand_if_available = "user_compile_flags", - ), - ], - ), - ], - ) - - unfiltered_compile_flags_feature = feature( - name = "unfiltered_compile_flags", - enabled = True, - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.linkstamp_compile, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.lto_backend, - ACTION_NAMES.clif_match, - ], - flag_groups = ([flag_group(flags = unfiltered_compile_flags)] if unfiltered_compile_flags else []), - ), - ], - ) - - features = [ - supports_pic_feature, - supports_start_end_lib_feature, - coverage_feature, - default_compile_flags_feature, - default_link_flags_feature, - fdo_optimize_feature, - supports_dynamic_linker_feature, - dbg_feature, - opt_feature, - user_compile_flags_feature, - sysroot_feature, - unfiltered_compile_flags_feature, - ] - - artifact_name_patterns = [ - ] - - make_variables = [] - - return cc_common.create_cc_toolchain_config_info( - ctx = ctx, - features = features, - action_configs = action_configs, - artifact_name_patterns = artifact_name_patterns, - cxx_builtin_include_directories = cxx_builtin_include_directories, - toolchain_identifier = "linux_gnu_x86", - host_system_name = "i686-unknown-linux-gnu", - target_system_name = "x86_64-unknown-linux-gnu", - target_cpu = "k8", - target_libc = "glibc_2.19", - compiler = "gcc", - abi_version = "gcc", - abi_libc_version = "glibc_2.19", - tool_paths = tool_paths, - make_variables = make_variables, - builtin_sysroot = "", - cc_target_os = None, - ) - -cc_toolchain_config = rule( - implementation = _impl, - attrs = { - "cpu": attr.string(mandatory = True), - "compiler": attr.string(), - }, - provides = [CcToolchainConfigInfo], -) diff --git a/third_party/toolchains/preconfig/centos6/gcc7/tools/cpp/empty.cc b/third_party/toolchains/preconfig/centos6/gcc7/tools/cpp/empty.cc deleted file mode 100755 index 237c8ce1817..00000000000 --- a/third_party/toolchains/preconfig/centos6/gcc7/tools/cpp/empty.cc +++ /dev/null @@ -1 +0,0 @@ -int main() {} diff --git a/third_party/toolchains/preconfig/centos6/py/BUILD b/third_party/toolchains/preconfig/centos6/py/BUILD deleted file mode 100755 index b8de94c9e2c..00000000000 --- a/third_party/toolchains/preconfig/centos6/py/BUILD +++ /dev/null @@ -1,174 +0,0 @@ -licenses(["restricted"]) - -package(default_visibility = ["//visibility:public"]) - -# To build Python C/C++ extension on Windows, we need to link to python import library pythonXY.lib -# See https://docs.python.org/3/extending/windows.html -cc_import( - name = "python_lib", - interface_library = select({ - ":windows": ":python_import_lib", - # A placeholder for Unix platforms which makes --no_build happy. 
- "//conditions:default": "not-existing.lib", - }), - system_provided = 1, -) - -cc_library( - name = "python_headers", - hdrs = [":python_include"], - includes = ["python_include"], - deps = select({ - ":windows": [":python_lib"], - "//conditions:default": [], - }), -) - -cc_library( - name = "numpy_headers", - hdrs = [":numpy_include"], - includes = ["numpy_include"], -) - -config_setting( - name = "windows", - values = {"cpu": "x64_windows"}, - visibility = ["//visibility:public"], -) - -genrule( - name = "python_include", - outs = [ - "python_include/Python-ast.h", - "python_include/Python.h", - "python_include/abstract.h", - "python_include/asdl.h", - "python_include/ast.h", - "python_include/bitset.h", - "python_include/boolobject.h", - "python_include/bufferobject.h", - "python_include/bytearrayobject.h", - "python_include/bytes_methods.h", - "python_include/bytesobject.h", - "python_include/cStringIO.h", - "python_include/cellobject.h", - "python_include/ceval.h", - "python_include/classobject.h", - "python_include/cobject.h", - "python_include/code.h", - "python_include/codecs.h", - "python_include/compile.h", - "python_include/complexobject.h", - "python_include/datetime.h", - "python_include/descrobject.h", - "python_include/dictobject.h", - "python_include/dtoa.h", - "python_include/enumobject.h", - "python_include/errcode.h", - "python_include/eval.h", - "python_include/fileobject.h", - "python_include/floatobject.h", - "python_include/frameobject.h", - "python_include/funcobject.h", - "python_include/genobject.h", - "python_include/graminit.h", - "python_include/grammar.h", - "python_include/import.h", - "python_include/intobject.h", - "python_include/intrcheck.h", - "python_include/iterobject.h", - "python_include/listobject.h", - "python_include/longintrepr.h", - "python_include/longobject.h", - "python_include/marshal.h", - "python_include/memoryobject.h", - "python_include/metagrammar.h", - "python_include/methodobject.h", - "python_include/modsupport.h", - "python_include/moduleobject.h", - "python_include/node.h", - "python_include/object.h", - "python_include/objimpl.h", - "python_include/opcode.h", - "python_include/osdefs.h", - "python_include/parsetok.h", - "python_include/patchlevel.h", - "python_include/pgen.h", - "python_include/pgenheaders.h", - "python_include/py_curses.h", - "python_include/pyarena.h", - "python_include/pycapsule.h", - "python_include/pyconfig-64.h", - "python_include/pyconfig.h", - "python_include/pyctype.h", - "python_include/pydebug.h", - "python_include/pyerrors.h", - "python_include/pyexpat.h", - "python_include/pyfpe.h", - "python_include/pygetopt.h", - "python_include/pymacconfig.h", - "python_include/pymactoolbox.h", - "python_include/pymath.h", - "python_include/pymem.h", - "python_include/pyport.h", - "python_include/pystate.h", - "python_include/pystrcmp.h", - "python_include/pystrtod.h", - "python_include/pythonrun.h", - "python_include/pythread.h", - "python_include/rangeobject.h", - "python_include/setobject.h", - "python_include/sliceobject.h", - "python_include/stringobject.h", - "python_include/structmember.h", - "python_include/structseq.h", - "python_include/symtable.h", - "python_include/sysmodule.h", - "python_include/timefuncs.h", - "python_include/token.h", - "python_include/traceback.h", - "python_include/tupleobject.h", - "python_include/ucnhash.h", - "python_include/unicodeobject.h", - "python_include/warnings.h", - "python_include/weakrefobject.h", - ], - cmd = """ -cp -f 
"/opt/rh/python27/root/usr/include/python2.7/Python-ast.h" "$(@D)/python_include/Python-ast.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/Python.h" "$(@D)/python_include/Python.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/abstract.h" "$(@D)/python_include/abstract.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/asdl.h" "$(@D)/python_include/asdl.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/ast.h" "$(@D)/python_include/ast.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/bitset.h" "$(@D)/python_include/bitset.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/boolobject.h" "$(@D)/python_include/boolobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/bufferobject.h" "$(@D)/python_include/bufferobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/bytearrayobject.h" "$(@D)/python_include/bytearrayobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/bytes_methods.h" "$(@D)/python_include/bytes_methods.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/bytesobject.h" "$(@D)/python_include/bytesobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/cStringIO.h" "$(@D)/python_include/cStringIO.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/cellobject.h" "$(@D)/python_include/cellobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/ceval.h" "$(@D)/python_include/ceval.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/classobject.h" "$(@D)/python_include/classobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/cobject.h" "$(@D)/python_include/cobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/code.h" "$(@D)/python_include/code.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/codecs.h" "$(@D)/python_include/codecs.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/compile.h" "$(@D)/python_include/compile.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/complexobject.h" "$(@D)/python_include/complexobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/datetime.h" "$(@D)/python_include/datetime.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/descrobject.h" "$(@D)/python_include/descrobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/dictobject.h" "$(@D)/python_include/dictobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/dtoa.h" "$(@D)/python_include/dtoa.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/enumobject.h" "$(@D)/python_include/enumobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/errcode.h" "$(@D)/python_include/errcode.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/eval.h" "$(@D)/python_include/eval.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/fileobject.h" "$(@D)/python_include/fileobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/floatobject.h" "$(@D)/python_include/floatobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/frameobject.h" "$(@D)/python_include/frameobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/funcobject.h" "$(@D)/python_include/funcobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/genobject.h" "$(@D)/python_include/genobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/graminit.h" "$(@D)/python_include/graminit.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/grammar.h" "$(@D)/python_include/grammar.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/import.h" "$(@D)/python_include/import.h" && cp -f 
"/opt/rh/python27/root/usr/include/python2.7/intobject.h" "$(@D)/python_include/intobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/intrcheck.h" "$(@D)/python_include/intrcheck.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/iterobject.h" "$(@D)/python_include/iterobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/listobject.h" "$(@D)/python_include/listobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/longintrepr.h" "$(@D)/python_include/longintrepr.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/longobject.h" "$(@D)/python_include/longobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/marshal.h" "$(@D)/python_include/marshal.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/memoryobject.h" "$(@D)/python_include/memoryobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/metagrammar.h" "$(@D)/python_include/metagrammar.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/methodobject.h" "$(@D)/python_include/methodobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/modsupport.h" "$(@D)/python_include/modsupport.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/moduleobject.h" "$(@D)/python_include/moduleobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/node.h" "$(@D)/python_include/node.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/object.h" "$(@D)/python_include/object.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/objimpl.h" "$(@D)/python_include/objimpl.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/opcode.h" "$(@D)/python_include/opcode.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/osdefs.h" "$(@D)/python_include/osdefs.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/parsetok.h" "$(@D)/python_include/parsetok.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/patchlevel.h" "$(@D)/python_include/patchlevel.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pgen.h" "$(@D)/python_include/pgen.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pgenheaders.h" "$(@D)/python_include/pgenheaders.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/py_curses.h" "$(@D)/python_include/py_curses.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pyarena.h" "$(@D)/python_include/pyarena.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pycapsule.h" "$(@D)/python_include/pycapsule.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pyconfig-64.h" "$(@D)/python_include/pyconfig-64.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pyconfig.h" "$(@D)/python_include/pyconfig.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pyctype.h" "$(@D)/python_include/pyctype.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pydebug.h" "$(@D)/python_include/pydebug.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pyerrors.h" "$(@D)/python_include/pyerrors.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pyexpat.h" "$(@D)/python_include/pyexpat.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pyfpe.h" "$(@D)/python_include/pyfpe.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pygetopt.h" "$(@D)/python_include/pygetopt.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pymacconfig.h" "$(@D)/python_include/pymacconfig.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pymactoolbox.h" "$(@D)/python_include/pymactoolbox.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pymath.h" "$(@D)/python_include/pymath.h" && cp -f 
"/opt/rh/python27/root/usr/include/python2.7/pymem.h" "$(@D)/python_include/pymem.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pyport.h" "$(@D)/python_include/pyport.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pystate.h" "$(@D)/python_include/pystate.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pystrcmp.h" "$(@D)/python_include/pystrcmp.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pystrtod.h" "$(@D)/python_include/pystrtod.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pythonrun.h" "$(@D)/python_include/pythonrun.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/pythread.h" "$(@D)/python_include/pythread.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/rangeobject.h" "$(@D)/python_include/rangeobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/setobject.h" "$(@D)/python_include/setobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/sliceobject.h" "$(@D)/python_include/sliceobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/stringobject.h" "$(@D)/python_include/stringobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/structmember.h" "$(@D)/python_include/structmember.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/structseq.h" "$(@D)/python_include/structseq.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/symtable.h" "$(@D)/python_include/symtable.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/sysmodule.h" "$(@D)/python_include/sysmodule.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/timefuncs.h" "$(@D)/python_include/timefuncs.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/token.h" "$(@D)/python_include/token.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/traceback.h" "$(@D)/python_include/traceback.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/tupleobject.h" "$(@D)/python_include/tupleobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/ucnhash.h" "$(@D)/python_include/ucnhash.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/unicodeobject.h" "$(@D)/python_include/unicodeobject.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/warnings.h" "$(@D)/python_include/warnings.h" && cp -f "/opt/rh/python27/root/usr/include/python2.7/weakrefobject.h" "$(@D)/python_include/weakrefobject.h" - """, -) - -genrule( - name = "numpy_include", - outs = [ - "numpy_include/numpy/__multiarray_api.h", - "numpy_include/numpy/__ufunc_api.h", - "numpy_include/numpy/_neighborhood_iterator_imp.h", - "numpy_include/numpy/_numpyconfig.h", - "numpy_include/numpy/arrayobject.h", - "numpy_include/numpy/arrayscalars.h", - "numpy_include/numpy/halffloat.h", - "numpy_include/numpy/multiarray_api.txt", - "numpy_include/numpy/ndarrayobject.h", - "numpy_include/numpy/ndarraytypes.h", - "numpy_include/numpy/noprefix.h", - "numpy_include/numpy/npy_1_7_deprecated_api.h", - "numpy_include/numpy/npy_3kcompat.h", - "numpy_include/numpy/npy_common.h", - "numpy_include/numpy/npy_cpu.h", - "numpy_include/numpy/npy_endian.h", - "numpy_include/numpy/npy_interrupt.h", - "numpy_include/numpy/npy_math.h", - "numpy_include/numpy/npy_no_deprecated_api.h", - "numpy_include/numpy/npy_os.h", - "numpy_include/numpy/numpyconfig.h", - "numpy_include/numpy/old_defines.h", - "numpy_include/numpy/oldnumeric.h", - "numpy_include/numpy/ufunc_api.txt", - "numpy_include/numpy/ufuncobject.h", - "numpy_include/numpy/utils.h", - ], - cmd = """ -cp -f 
"/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/__multiarray_api.h" "$(@D)/numpy_include/numpy/__multiarray_api.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/__ufunc_api.h" "$(@D)/numpy_include/numpy/__ufunc_api.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/_neighborhood_iterator_imp.h" "$(@D)/numpy_include/numpy/_neighborhood_iterator_imp.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/_numpyconfig.h" "$(@D)/numpy_include/numpy/_numpyconfig.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/arrayobject.h" "$(@D)/numpy_include/numpy/arrayobject.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/arrayscalars.h" "$(@D)/numpy_include/numpy/arrayscalars.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/halffloat.h" "$(@D)/numpy_include/numpy/halffloat.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/multiarray_api.txt" "$(@D)/numpy_include/numpy/multiarray_api.txt" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/ndarrayobject.h" "$(@D)/numpy_include/numpy/ndarrayobject.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/ndarraytypes.h" "$(@D)/numpy_include/numpy/ndarraytypes.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/noprefix.h" "$(@D)/numpy_include/numpy/noprefix.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_1_7_deprecated_api.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/npy_3kcompat.h" "$(@D)/numpy_include/numpy/npy_3kcompat.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/npy_common.h" "$(@D)/numpy_include/numpy/npy_common.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/npy_cpu.h" "$(@D)/numpy_include/numpy/npy_cpu.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/npy_endian.h" "$(@D)/numpy_include/numpy/npy_endian.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/npy_interrupt.h" "$(@D)/numpy_include/numpy/npy_interrupt.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/npy_math.h" "$(@D)/numpy_include/numpy/npy_math.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/npy_no_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_no_deprecated_api.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/npy_os.h" "$(@D)/numpy_include/numpy/npy_os.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/numpyconfig.h" "$(@D)/numpy_include/numpy/numpyconfig.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/old_defines.h" "$(@D)/numpy_include/numpy/old_defines.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/oldnumeric.h" "$(@D)/numpy_include/numpy/oldnumeric.h" && cp -f 
"/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/ufunc_api.txt" "$(@D)/numpy_include/numpy/ufunc_api.txt" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/ufuncobject.h" "$(@D)/numpy_include/numpy/ufuncobject.h" && cp -f "/opt/rh/python27/root/usr/lib64/python2.7/site-packages/numpy/core/include/numpy/utils.h" "$(@D)/numpy_include/numpy/utils.h" - """, -) diff --git a/third_party/toolchains/preconfig/centos6/py/WORKSPACE b/third_party/toolchains/preconfig/centos6/py/WORKSPACE deleted file mode 100644 index 1d298fefa3b..00000000000 --- a/third_party/toolchains/preconfig/centos6/py/WORKSPACE +++ /dev/null @@ -1,2 +0,0 @@ -# DO NOT EDIT: automatically generated WORKSPACE file for python_configure rule -workspace(name = "local_config_python") diff --git a/third_party/toolchains/preconfig/centos6/py3/BUILD b/third_party/toolchains/preconfig/centos6/py3/BUILD deleted file mode 100755 index ac17c471427..00000000000 --- a/third_party/toolchains/preconfig/centos6/py3/BUILD +++ /dev/null @@ -1,181 +0,0 @@ -licenses(["restricted"]) - -package(default_visibility = ["//visibility:public"]) - -# To build Python C/C++ extension on Windows, we need to link to python import library pythonXY.lib -# See https://docs.python.org/3/extending/windows.html -cc_import( - name = "python_lib", - interface_library = select({ - ":windows": ":python_import_lib", - # A placeholder for Unix platforms which makes --no_build happy. - "//conditions:default": "not-existing.lib", - }), - system_provided = 1, -) - -cc_library( - name = "python_headers", - hdrs = [":python_include"], - includes = ["python_include"], - deps = select({ - ":windows": [":python_lib"], - "//conditions:default": [], - }), -) - -cc_library( - name = "numpy_headers", - hdrs = [":numpy_include"], - includes = ["numpy_include"], -) - -config_setting( - name = "windows", - values = {"cpu": "x64_windows"}, - visibility = ["//visibility:public"], -) - -genrule( - name = "python_include", - outs = [ - "python_include/Python-ast.h", - "python_include/Python.h", - "python_include/abstract.h", - "python_include/accu.h", - "python_include/asdl.h", - "python_include/ast.h", - "python_include/bitset.h", - "python_include/bltinmodule.h", - "python_include/boolobject.h", - "python_include/bytearrayobject.h", - "python_include/bytes_methods.h", - "python_include/bytesobject.h", - "python_include/cellobject.h", - "python_include/ceval.h", - "python_include/classobject.h", - "python_include/code.h", - "python_include/codecs.h", - "python_include/compile.h", - "python_include/complexobject.h", - "python_include/datetime.h", - "python_include/descrobject.h", - "python_include/dictobject.h", - "python_include/dtoa.h", - "python_include/dynamic_annotations.h", - "python_include/enumobject.h", - "python_include/errcode.h", - "python_include/eval.h", - "python_include/fileobject.h", - "python_include/fileutils.h", - "python_include/floatobject.h", - "python_include/frameobject.h", - "python_include/funcobject.h", - "python_include/genobject.h", - "python_include/graminit.h", - "python_include/grammar.h", - "python_include/import.h", - "python_include/intrcheck.h", - "python_include/iterobject.h", - "python_include/listobject.h", - "python_include/longintrepr.h", - "python_include/longobject.h", - "python_include/marshal.h", - "python_include/memoryobject.h", - "python_include/metagrammar.h", - "python_include/methodobject.h", - "python_include/modsupport.h", - "python_include/moduleobject.h", - 
"python_include/namespaceobject.h", - "python_include/node.h", - "python_include/object.h", - "python_include/objimpl.h", - "python_include/odictobject.h", - "python_include/opcode.h", - "python_include/osdefs.h", - "python_include/osmodule.h", - "python_include/parsetok.h", - "python_include/patchlevel.h", - "python_include/pgen.h", - "python_include/pgenheaders.h", - "python_include/py_curses.h", - "python_include/pyarena.h", - "python_include/pyatomic.h", - "python_include/pycapsule.h", - "python_include/pyconfig.h", - "python_include/pyctype.h", - "python_include/pydebug.h", - "python_include/pydtrace.h", - "python_include/pyerrors.h", - "python_include/pyexpat.h", - "python_include/pyfpe.h", - "python_include/pygetopt.h", - "python_include/pyhash.h", - "python_include/pylifecycle.h", - "python_include/pymacconfig.h", - "python_include/pymacro.h", - "python_include/pymath.h", - "python_include/pymem.h", - "python_include/pyport.h", - "python_include/pystate.h", - "python_include/pystrcmp.h", - "python_include/pystrhex.h", - "python_include/pystrtod.h", - "python_include/pythonrun.h", - "python_include/pythread.h", - "python_include/pytime.h", - "python_include/rangeobject.h", - "python_include/setobject.h", - "python_include/sliceobject.h", - "python_include/structmember.h", - "python_include/structseq.h", - "python_include/symtable.h", - "python_include/sysmodule.h", - "python_include/token.h", - "python_include/traceback.h", - "python_include/tupleobject.h", - "python_include/typeslots.h", - "python_include/ucnhash.h", - "python_include/unicodeobject.h", - "python_include/warnings.h", - "python_include/weakrefobject.h", - ], - cmd = """ -cp -f "/usr/local/include/python3.6m/Python-ast.h" "$(@D)/python_include/Python-ast.h" && cp -f "/usr/local/include/python3.6m/Python.h" "$(@D)/python_include/Python.h" && cp -f "/usr/local/include/python3.6m/abstract.h" "$(@D)/python_include/abstract.h" && cp -f "/usr/local/include/python3.6m/accu.h" "$(@D)/python_include/accu.h" && cp -f "/usr/local/include/python3.6m/asdl.h" "$(@D)/python_include/asdl.h" && cp -f "/usr/local/include/python3.6m/ast.h" "$(@D)/python_include/ast.h" && cp -f "/usr/local/include/python3.6m/bitset.h" "$(@D)/python_include/bitset.h" && cp -f "/usr/local/include/python3.6m/bltinmodule.h" "$(@D)/python_include/bltinmodule.h" && cp -f "/usr/local/include/python3.6m/boolobject.h" "$(@D)/python_include/boolobject.h" && cp -f "/usr/local/include/python3.6m/bytearrayobject.h" "$(@D)/python_include/bytearrayobject.h" && cp -f "/usr/local/include/python3.6m/bytes_methods.h" "$(@D)/python_include/bytes_methods.h" && cp -f "/usr/local/include/python3.6m/bytesobject.h" "$(@D)/python_include/bytesobject.h" && cp -f "/usr/local/include/python3.6m/cellobject.h" "$(@D)/python_include/cellobject.h" && cp -f "/usr/local/include/python3.6m/ceval.h" "$(@D)/python_include/ceval.h" && cp -f "/usr/local/include/python3.6m/classobject.h" "$(@D)/python_include/classobject.h" && cp -f "/usr/local/include/python3.6m/code.h" "$(@D)/python_include/code.h" && cp -f "/usr/local/include/python3.6m/codecs.h" "$(@D)/python_include/codecs.h" && cp -f "/usr/local/include/python3.6m/compile.h" "$(@D)/python_include/compile.h" && cp -f "/usr/local/include/python3.6m/complexobject.h" "$(@D)/python_include/complexobject.h" && cp -f "/usr/local/include/python3.6m/datetime.h" "$(@D)/python_include/datetime.h" && cp -f "/usr/local/include/python3.6m/descrobject.h" "$(@D)/python_include/descrobject.h" && cp -f "/usr/local/include/python3.6m/dictobject.h" 
"$(@D)/python_include/dictobject.h" && cp -f "/usr/local/include/python3.6m/dtoa.h" "$(@D)/python_include/dtoa.h" && cp -f "/usr/local/include/python3.6m/dynamic_annotations.h" "$(@D)/python_include/dynamic_annotations.h" && cp -f "/usr/local/include/python3.6m/enumobject.h" "$(@D)/python_include/enumobject.h" && cp -f "/usr/local/include/python3.6m/errcode.h" "$(@D)/python_include/errcode.h" && cp -f "/usr/local/include/python3.6m/eval.h" "$(@D)/python_include/eval.h" && cp -f "/usr/local/include/python3.6m/fileobject.h" "$(@D)/python_include/fileobject.h" && cp -f "/usr/local/include/python3.6m/fileutils.h" "$(@D)/python_include/fileutils.h" && cp -f "/usr/local/include/python3.6m/floatobject.h" "$(@D)/python_include/floatobject.h" && cp -f "/usr/local/include/python3.6m/frameobject.h" "$(@D)/python_include/frameobject.h" && cp -f "/usr/local/include/python3.6m/funcobject.h" "$(@D)/python_include/funcobject.h" && cp -f "/usr/local/include/python3.6m/genobject.h" "$(@D)/python_include/genobject.h" && cp -f "/usr/local/include/python3.6m/graminit.h" "$(@D)/python_include/graminit.h" && cp -f "/usr/local/include/python3.6m/grammar.h" "$(@D)/python_include/grammar.h" && cp -f "/usr/local/include/python3.6m/import.h" "$(@D)/python_include/import.h" && cp -f "/usr/local/include/python3.6m/intrcheck.h" "$(@D)/python_include/intrcheck.h" && cp -f "/usr/local/include/python3.6m/iterobject.h" "$(@D)/python_include/iterobject.h" && cp -f "/usr/local/include/python3.6m/listobject.h" "$(@D)/python_include/listobject.h" && cp -f "/usr/local/include/python3.6m/longintrepr.h" "$(@D)/python_include/longintrepr.h" && cp -f "/usr/local/include/python3.6m/longobject.h" "$(@D)/python_include/longobject.h" && cp -f "/usr/local/include/python3.6m/marshal.h" "$(@D)/python_include/marshal.h" && cp -f "/usr/local/include/python3.6m/memoryobject.h" "$(@D)/python_include/memoryobject.h" && cp -f "/usr/local/include/python3.6m/metagrammar.h" "$(@D)/python_include/metagrammar.h" && cp -f "/usr/local/include/python3.6m/methodobject.h" "$(@D)/python_include/methodobject.h" && cp -f "/usr/local/include/python3.6m/modsupport.h" "$(@D)/python_include/modsupport.h" && cp -f "/usr/local/include/python3.6m/moduleobject.h" "$(@D)/python_include/moduleobject.h" && cp -f "/usr/local/include/python3.6m/namespaceobject.h" "$(@D)/python_include/namespaceobject.h" && cp -f "/usr/local/include/python3.6m/node.h" "$(@D)/python_include/node.h" && cp -f "/usr/local/include/python3.6m/object.h" "$(@D)/python_include/object.h" && cp -f "/usr/local/include/python3.6m/objimpl.h" "$(@D)/python_include/objimpl.h" && cp -f "/usr/local/include/python3.6m/odictobject.h" "$(@D)/python_include/odictobject.h" && cp -f "/usr/local/include/python3.6m/opcode.h" "$(@D)/python_include/opcode.h" && cp -f "/usr/local/include/python3.6m/osdefs.h" "$(@D)/python_include/osdefs.h" && cp -f "/usr/local/include/python3.6m/osmodule.h" "$(@D)/python_include/osmodule.h" && cp -f "/usr/local/include/python3.6m/parsetok.h" "$(@D)/python_include/parsetok.h" && cp -f "/usr/local/include/python3.6m/patchlevel.h" "$(@D)/python_include/patchlevel.h" && cp -f "/usr/local/include/python3.6m/pgen.h" "$(@D)/python_include/pgen.h" && cp -f "/usr/local/include/python3.6m/pgenheaders.h" "$(@D)/python_include/pgenheaders.h" && cp -f "/usr/local/include/python3.6m/py_curses.h" "$(@D)/python_include/py_curses.h" && cp -f "/usr/local/include/python3.6m/pyarena.h" "$(@D)/python_include/pyarena.h" && cp -f "/usr/local/include/python3.6m/pyatomic.h" "$(@D)/python_include/pyatomic.h" 
&& cp -f "/usr/local/include/python3.6m/pycapsule.h" "$(@D)/python_include/pycapsule.h" && cp -f "/usr/local/include/python3.6m/pyconfig.h" "$(@D)/python_include/pyconfig.h" && cp -f "/usr/local/include/python3.6m/pyctype.h" "$(@D)/python_include/pyctype.h" && cp -f "/usr/local/include/python3.6m/pydebug.h" "$(@D)/python_include/pydebug.h" && cp -f "/usr/local/include/python3.6m/pydtrace.h" "$(@D)/python_include/pydtrace.h" && cp -f "/usr/local/include/python3.6m/pyerrors.h" "$(@D)/python_include/pyerrors.h" && cp -f "/usr/local/include/python3.6m/pyexpat.h" "$(@D)/python_include/pyexpat.h" && cp -f "/usr/local/include/python3.6m/pyfpe.h" "$(@D)/python_include/pyfpe.h" && cp -f "/usr/local/include/python3.6m/pygetopt.h" "$(@D)/python_include/pygetopt.h" && cp -f "/usr/local/include/python3.6m/pyhash.h" "$(@D)/python_include/pyhash.h" && cp -f "/usr/local/include/python3.6m/pylifecycle.h" "$(@D)/python_include/pylifecycle.h" && cp -f "/usr/local/include/python3.6m/pymacconfig.h" "$(@D)/python_include/pymacconfig.h" && cp -f "/usr/local/include/python3.6m/pymacro.h" "$(@D)/python_include/pymacro.h" && cp -f "/usr/local/include/python3.6m/pymath.h" "$(@D)/python_include/pymath.h" && cp -f "/usr/local/include/python3.6m/pymem.h" "$(@D)/python_include/pymem.h" && cp -f "/usr/local/include/python3.6m/pyport.h" "$(@D)/python_include/pyport.h" && cp -f "/usr/local/include/python3.6m/pystate.h" "$(@D)/python_include/pystate.h" && cp -f "/usr/local/include/python3.6m/pystrcmp.h" "$(@D)/python_include/pystrcmp.h" && cp -f "/usr/local/include/python3.6m/pystrhex.h" "$(@D)/python_include/pystrhex.h" && cp -f "/usr/local/include/python3.6m/pystrtod.h" "$(@D)/python_include/pystrtod.h" && cp -f "/usr/local/include/python3.6m/pythonrun.h" "$(@D)/python_include/pythonrun.h" && cp -f "/usr/local/include/python3.6m/pythread.h" "$(@D)/python_include/pythread.h" && cp -f "/usr/local/include/python3.6m/pytime.h" "$(@D)/python_include/pytime.h" && cp -f "/usr/local/include/python3.6m/rangeobject.h" "$(@D)/python_include/rangeobject.h" && cp -f "/usr/local/include/python3.6m/setobject.h" "$(@D)/python_include/setobject.h" && cp -f "/usr/local/include/python3.6m/sliceobject.h" "$(@D)/python_include/sliceobject.h" && cp -f "/usr/local/include/python3.6m/structmember.h" "$(@D)/python_include/structmember.h" && cp -f "/usr/local/include/python3.6m/structseq.h" "$(@D)/python_include/structseq.h" && cp -f "/usr/local/include/python3.6m/symtable.h" "$(@D)/python_include/symtable.h" && cp -f "/usr/local/include/python3.6m/sysmodule.h" "$(@D)/python_include/sysmodule.h" && cp -f "/usr/local/include/python3.6m/token.h" "$(@D)/python_include/token.h" && cp -f "/usr/local/include/python3.6m/traceback.h" "$(@D)/python_include/traceback.h" && cp -f "/usr/local/include/python3.6m/tupleobject.h" "$(@D)/python_include/tupleobject.h" && cp -f "/usr/local/include/python3.6m/typeslots.h" "$(@D)/python_include/typeslots.h" && cp -f "/usr/local/include/python3.6m/ucnhash.h" "$(@D)/python_include/ucnhash.h" && cp -f "/usr/local/include/python3.6m/unicodeobject.h" "$(@D)/python_include/unicodeobject.h" && cp -f "/usr/local/include/python3.6m/warnings.h" "$(@D)/python_include/warnings.h" && cp -f "/usr/local/include/python3.6m/weakrefobject.h" "$(@D)/python_include/weakrefobject.h" - """, -) - -genrule( - name = "numpy_include", - outs = [ - "numpy_include/numpy/__multiarray_api.h", - "numpy_include/numpy/__ufunc_api.h", - "numpy_include/numpy/_neighborhood_iterator_imp.h", - "numpy_include/numpy/_numpyconfig.h", - 
"numpy_include/numpy/arrayobject.h", - "numpy_include/numpy/arrayscalars.h", - "numpy_include/numpy/halffloat.h", - "numpy_include/numpy/multiarray_api.txt", - "numpy_include/numpy/ndarrayobject.h", - "numpy_include/numpy/ndarraytypes.h", - "numpy_include/numpy/noprefix.h", - "numpy_include/numpy/npy_1_7_deprecated_api.h", - "numpy_include/numpy/npy_3kcompat.h", - "numpy_include/numpy/npy_common.h", - "numpy_include/numpy/npy_cpu.h", - "numpy_include/numpy/npy_endian.h", - "numpy_include/numpy/npy_interrupt.h", - "numpy_include/numpy/npy_math.h", - "numpy_include/numpy/npy_no_deprecated_api.h", - "numpy_include/numpy/npy_os.h", - "numpy_include/numpy/numpyconfig.h", - "numpy_include/numpy/old_defines.h", - "numpy_include/numpy/oldnumeric.h", - "numpy_include/numpy/ufunc_api.txt", - "numpy_include/numpy/ufuncobject.h", - "numpy_include/numpy/utils.h", - ], - cmd = """ -cp -f "/usr/local/lib/python3.6/site-packages/numpy/core/include/numpy/__multiarray_api.h" "$(@D)/numpy_include/numpy/__multiarray_api.h" && cp -f "/usr/local/lib/python3.6/site-packages/numpy/core/include/numpy/__ufunc_api.h" "$(@D)/numpy_include/numpy/__ufunc_api.h" && cp -f "/usr/local/lib/python3.6/site-packages/numpy/core/include/numpy/_neighborhood_iterator_imp.h" "$(@D)/numpy_include/numpy/_neighborhood_iterator_imp.h" && cp -f "/usr/local/lib/python3.6/site-packages/numpy/core/include/numpy/_numpyconfig.h" "$(@D)/numpy_include/numpy/_numpyconfig.h" && cp -f "/usr/local/lib/python3.6/site-packages/numpy/core/include/numpy/arrayobject.h" "$(@D)/numpy_include/numpy/arrayobject.h" && cp -f "/usr/local/lib/python3.6/site-packages/numpy/core/include/numpy/arrayscalars.h" "$(@D)/numpy_include/numpy/arrayscalars.h" && cp -f "/usr/local/lib/python3.6/site-packages/numpy/core/include/numpy/halffloat.h" "$(@D)/numpy_include/numpy/halffloat.h" && cp -f "/usr/local/lib/python3.6/site-packages/numpy/core/include/numpy/multiarray_api.txt" "$(@D)/numpy_include/numpy/multiarray_api.txt" && cp -f "/usr/local/lib/python3.6/site-packages/numpy/core/include/numpy/ndarrayobject.h" "$(@D)/numpy_include/numpy/ndarrayobject.h" && cp -f "/usr/local/lib/python3.6/site-packages/numpy/core/include/numpy/ndarraytypes.h" "$(@D)/numpy_include/numpy/ndarraytypes.h" && cp -f "/usr/local/lib/python3.6/site-packages/numpy/core/include/numpy/noprefix.h" "$(@D)/numpy_include/numpy/noprefix.h" && cp -f "/usr/local/lib/python3.6/site-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_1_7_deprecated_api.h" && cp -f "/usr/local/lib/python3.6/site-packages/numpy/core/include/numpy/npy_3kcompat.h" "$(@D)/numpy_include/numpy/npy_3kcompat.h" && cp -f "/usr/local/lib/python3.6/site-packages/numpy/core/include/numpy/npy_common.h" "$(@D)/numpy_include/numpy/npy_common.h" && cp -f "/usr/local/lib/python3.6/site-packages/numpy/core/include/numpy/npy_cpu.h" "$(@D)/numpy_include/numpy/npy_cpu.h" && cp -f "/usr/local/lib/python3.6/site-packages/numpy/core/include/numpy/npy_endian.h" "$(@D)/numpy_include/numpy/npy_endian.h" && cp -f "/usr/local/lib/python3.6/site-packages/numpy/core/include/numpy/npy_interrupt.h" "$(@D)/numpy_include/numpy/npy_interrupt.h" && cp -f "/usr/local/lib/python3.6/site-packages/numpy/core/include/numpy/npy_math.h" "$(@D)/numpy_include/numpy/npy_math.h" && cp -f "/usr/local/lib/python3.6/site-packages/numpy/core/include/numpy/npy_no_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_no_deprecated_api.h" && cp -f "/usr/local/lib/python3.6/site-packages/numpy/core/include/numpy/npy_os.h" 
"$(@D)/numpy_include/numpy/npy_os.h" && cp -f "/usr/local/lib/python3.6/site-packages/numpy/core/include/numpy/numpyconfig.h" "$(@D)/numpy_include/numpy/numpyconfig.h" && cp -f "/usr/local/lib/python3.6/site-packages/numpy/core/include/numpy/old_defines.h" "$(@D)/numpy_include/numpy/old_defines.h" && cp -f "/usr/local/lib/python3.6/site-packages/numpy/core/include/numpy/oldnumeric.h" "$(@D)/numpy_include/numpy/oldnumeric.h" && cp -f "/usr/local/lib/python3.6/site-packages/numpy/core/include/numpy/ufunc_api.txt" "$(@D)/numpy_include/numpy/ufunc_api.txt" && cp -f "/usr/local/lib/python3.6/site-packages/numpy/core/include/numpy/ufuncobject.h" "$(@D)/numpy_include/numpy/ufuncobject.h" && cp -f "/usr/local/lib/python3.6/site-packages/numpy/core/include/numpy/utils.h" "$(@D)/numpy_include/numpy/utils.h" - """, -) diff --git a/third_party/toolchains/preconfig/centos6/py3/WORKSPACE b/third_party/toolchains/preconfig/centos6/py3/WORKSPACE deleted file mode 100644 index 1d298fefa3b..00000000000 --- a/third_party/toolchains/preconfig/centos6/py3/WORKSPACE +++ /dev/null @@ -1,2 +0,0 @@ -# DO NOT EDIT: automatically generated WORKSPACE file for python_configure rule -workspace(name = "local_config_python") diff --git a/third_party/toolchains/preconfig/centos6/tensorrt5/BUILD b/third_party/toolchains/preconfig/centos6/tensorrt5/BUILD deleted file mode 100755 index 3e96ceb3324..00000000000 --- a/third_party/toolchains/preconfig/centos6/tensorrt5/BUILD +++ /dev/null @@ -1,62 +0,0 @@ -# NVIDIA TensorRT -# A high-performance deep learning inference optimizer and runtime. - -licenses(["notice"]) - -load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts") - -package(default_visibility = ["//visibility:public"]) - -exports_files(["LICENSE"]) - -cc_library( - name = "tensorrt_headers", - hdrs = [ - "tensorrt/include/tensorrt_config.h", - ":tensorrt_include", - ], - include_prefix = "third_party/tensorrt", - strip_include_prefix = "tensorrt/include", -) - -cc_library( - name = "tensorrt", - srcs = [":tensorrt_lib"], - copts = cuda_default_copts(), - data = [":tensorrt_lib"], - linkstatic = 1, - deps = [ - ":tensorrt_headers", - "@local_config_cuda//cuda", - ], -) - -bzl_library( - name = "build_defs_bzl", - srcs = ["build_defs.bzl"], - deps = [ - "@bazel_skylib//lib:selects", - ], -) - -genrule( - name = "tensorrt_lib", - outs = [ - "tensorrt/lib/libnvinfer.so.5", - "tensorrt/lib/libnvinfer_plugin.so.5", - ], - cmd = """cp -f "/usr/lib64/libnvinfer.so.5" "$(location tensorrt/lib/libnvinfer.so.5)" && \ -cp -f "/usr/lib64/libnvinfer_plugin.so.5" "$(location tensorrt/lib/libnvinfer_plugin.so.5)" """, -) - -genrule( - name = "tensorrt_include", - outs = [ - "tensorrt/include/NvInfer.h", - "tensorrt/include/NvUtils.h", - "tensorrt/include/NvInferPlugin.h", - ], - cmd = """cp -f "/usr/include/NvInfer.h" "$(location tensorrt/include/NvInfer.h)" && \ -cp -f "/usr/include/NvUtils.h" "$(location tensorrt/include/NvUtils.h)" && \ -cp -f "/usr/include/NvInferPlugin.h" "$(location tensorrt/include/NvInferPlugin.h)" """, -) diff --git a/third_party/toolchains/preconfig/centos6/tensorrt5/LICENSE b/third_party/toolchains/preconfig/centos6/tensorrt5/LICENSE deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/third_party/toolchains/preconfig/centos6/tensorrt5/WORKSPACE b/third_party/toolchains/preconfig/centos6/tensorrt5/WORKSPACE deleted file mode 100644 index ce47f14b91b..00000000000 --- a/third_party/toolchains/preconfig/centos6/tensorrt5/WORKSPACE +++ /dev/null @@ -1,2 +0,0 @@ -# DO NOT EDIT: 
automatically generated WORKSPACE file for tensorrt_configure rule -workspace(name = "local_config_tensorrt") diff --git a/third_party/toolchains/preconfig/centos6/tensorrt5/build_defs.bzl b/third_party/toolchains/preconfig/centos6/tensorrt5/build_defs.bzl deleted file mode 100755 index 527be938341..00000000000 --- a/third_party/toolchains/preconfig/centos6/tensorrt5/build_defs.bzl +++ /dev/null @@ -1,5 +0,0 @@ -# Build configurations for TensorRT. - -def if_tensorrt(if_true, if_false = []): - """Tests whether TensorRT was enabled during the configure process.""" - return if_true diff --git a/third_party/toolchains/preconfig/centos6/tensorrt5/tensorrt/include/tensorrt_config.h b/third_party/toolchains/preconfig/centos6/tensorrt5/tensorrt/include/tensorrt_config.h deleted file mode 100644 index 02a166f4cd1..00000000000 --- a/third_party/toolchains/preconfig/centos6/tensorrt5/tensorrt/include/tensorrt_config.h +++ /dev/null @@ -1,21 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORRT_TENSORRT_INCLUDE_CONFIG_H_ -#define TENSORRT_TENSORRT_INCLUDE_CONFIG_H_ - -#define TF_TENSORRT_VERSION "5" - -#endif // TENSORRT_TENSORRT_INCLUDE_CONFIG_H_ diff --git a/third_party/toolchains/preconfig/generate/containers.bzl b/third_party/toolchains/preconfig/generate/containers.bzl index df0ac112f4a..397b36d9f3b 100644 --- a/third_party/toolchains/preconfig/generate/containers.bzl +++ b/third_party/toolchains/preconfig/generate/containers.bzl @@ -10,4 +10,5 @@ container_digests = { "cuda10.0-cudnn7-ubuntu16.04-manylinux2010": "sha256:5812d9d0ef0a3276fc5faaf4cd01f3d6e03d635893a6e2d2e04f6f01d626c432", "cuda10.1-cudnn7-ubuntu16.04-manylinux2010": "sha256:23db3de806535c9d26170567ba55cf653e503057345a0e9c129124c08ea118a3", "rocm-ubuntu16.04": "sha256:e645447dd6127325f3e97b8bf23424f637a8579d963b34fcc6772cf7cfaa0ebe", + "windows-1803": "sha256:f109576c7c0c8a1783ff22b666e8923b52dbbe7933f69a1c7a7275202c304a12", } diff --git a/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/WORKSPACE b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/WORKSPACE deleted file mode 100644 index b61f572d6d2..00000000000 --- a/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/WORKSPACE +++ /dev/null @@ -1,2 +0,0 @@ -# DO NOT EDIT: automatically generated WORKSPACE file for cuda_configure rule -workspace(name = "local_config_cuda") diff --git a/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD deleted file mode 100755 index 95ec02dd868..00000000000 --- a/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD +++ /dev/null @@ -1,1282 +0,0 @@ -load(":build_defs.bzl", "cuda_header_library") -load("@bazel_skylib//:bzl_library.bzl", "bzl_library") - -licenses(["restricted"]) # MPL2, portions GPL v3, LGPL v3, BSD-like - -package(default_visibility = 
["//visibility:public"]) - -config_setting( - name = "using_nvcc", - values = { - "define": "using_cuda_nvcc=true", - }, -) - -config_setting( - name = "using_clang", - values = { - "define": "using_cuda_clang=true", - }, -) - -# Equivalent to using_clang && -c opt. -config_setting( - name = "using_clang_opt", - values = { - "define": "using_cuda_clang=true", - "compilation_mode": "opt", - }, -) - -config_setting( - name = "darwin", - values = {"cpu": "darwin"}, -) - -config_setting( - name = "freebsd", - values = {"cpu": "freebsd"}, -) - -cuda_header_library( - name = "cuda_headers", - hdrs = [ - "cuda/cuda_config.h", - ":cuda-include", - ], - include_prefix = "third_party/gpus", - includes = [ - ".", # required to include cuda/cuda/cuda_config.h as cuda/config.h - "cuda/include", - ], -) - -cc_library( - name = "cudart_static", - srcs = ["cuda/lib/libcudart_static.a"], - linkopts = select({ - ":freebsd": [], - "//conditions:default": ["-ldl"], - }) + [ - "-lpthread", - "-lrt", - ], -) - -cc_library( - name = "cuda_driver", - srcs = ["cuda/lib/libcuda.so"], -) - -cc_library( - name = "cudart", - srcs = ["cuda/lib/libcudart.so.10.0"], - data = ["cuda/lib/libcudart.so.10.0"], - linkstatic = 1, -) - -cuda_header_library( - name = "cublas_headers", - hdrs = [":cublas-include"], - include_prefix = "third_party/gpus/cuda/include", - includes = ["cublas/include"], - strip_include_prefix = "cublas/include", - deps = [":cuda_headers"], -) - -cc_library( - name = "cublas", - srcs = ["cuda/lib/libcublas.so.10.0"], - data = ["cuda/lib/libcublas.so.10.0"], - linkstatic = 1, -) - -cc_library( - name = "cusolver", - srcs = ["cuda/lib/libcusolver.so.10.0"], - data = ["cuda/lib/libcusolver.so.10.0"], - linkopts = ["-lgomp"], - linkstatic = 1, -) - -cc_library( - name = "cudnn", - srcs = ["cuda/lib/libcudnn.so.7"], - data = ["cuda/lib/libcudnn.so.7"], - linkstatic = 1, -) - -cc_library( - name = "cudnn_header", - hdrs = [":cudnn-include"], - include_prefix = "third_party/gpus/cudnn", - strip_include_prefix = "cudnn/include", - deps = [":cuda_headers"], -) - -cc_library( - name = "cufft", - srcs = ["cuda/lib/libcufft.so.10.0"], - data = ["cuda/lib/libcufft.so.10.0"], - linkstatic = 1, -) - -cc_library( - name = "curand", - srcs = ["cuda/lib/libcurand.so.10.0"], - data = ["cuda/lib/libcurand.so.10.0"], - linkstatic = 1, -) - -cc_library( - name = "cuda", - deps = [ - ":cublas", - ":cuda_headers", - ":cudart", - ":cudnn", - ":cufft", - ":curand", - ], -) - -cuda_header_library( - name = "cupti_headers", - hdrs = [":cuda-extras"], - include_prefix = "third_party/gpus", - includes = ["cuda/extras/CUPTI/include/"], - deps = [":cuda_headers"], -) - -cc_library( - name = "cupti_dsos", - data = ["cuda/lib/libcupti.so.10.0"], -) - -cc_library( - name = "cusparse", - srcs = ["cuda/lib/libcusparse.so.10.0"], - data = ["cuda/lib/libcusparse.so.10.0"], - linkopts = ["-lgomp"], - linkstatic = 1, -) - -cc_library( - name = "libdevice_root", - data = [":cuda-nvvm"], -) - -bzl_library( - name = "build_defs_bzl", - srcs = ["build_defs.bzl"], - deps = [ - "@bazel_skylib//lib:selects", - ], -) - -genrule( - name = "cuda-include", - outs = [ - "cuda/include/CL/cl.h", - "cuda/include/CL/cl.hpp", - "cuda/include/CL/cl_egl.h", - "cuda/include/CL/cl_ext.h", - "cuda/include/CL/cl_gl.h", - "cuda/include/CL/cl_gl_ext.h", - "cuda/include/CL/cl_platform.h", - "cuda/include/CL/opencl.h", - "cuda/include/builtin_types.h", - "cuda/include/channel_descriptor.h", - "cuda/include/common_functions.h", - "cuda/include/cooperative_groups.h", - 
"cuda/include/cooperative_groups_helpers.h", - "cuda/include/crt/common_functions.h", - "cuda/include/crt/device_double_functions.h", - "cuda/include/crt/device_double_functions.hpp", - "cuda/include/crt/device_functions.h", - "cuda/include/crt/device_functions.hpp", - "cuda/include/crt/func_macro.h", - "cuda/include/crt/host_config.h", - "cuda/include/crt/host_defines.h", - "cuda/include/crt/host_runtime.h", - "cuda/include/crt/math_functions.h", - "cuda/include/crt/math_functions.hpp", - "cuda/include/crt/mma.h", - "cuda/include/crt/mma.hpp", - "cuda/include/crt/nvfunctional", - "cuda/include/crt/sm_70_rt.h", - "cuda/include/crt/sm_70_rt.hpp", - "cuda/include/crt/storage_class.h", - "cuda/include/cuComplex.h", - "cuda/include/cublas.h", - "cuda/include/cublasXt.h", - "cuda/include/cublas_api.h", - "cuda/include/cublas_v2.h", - "cuda/include/cuda.h", - "cuda/include/cudaEGL.h", - "cuda/include/cudaGL.h", - "cuda/include/cudaProfiler.h", - "cuda/include/cudaVDPAU.h", - "cuda/include/cuda_device_runtime_api.h", - "cuda/include/cuda_egl_interop.h", - "cuda/include/cuda_fp16.h", - "cuda/include/cuda_fp16.hpp", - "cuda/include/cuda_gl_interop.h", - "cuda/include/cuda_occupancy.h", - "cuda/include/cuda_profiler_api.h", - "cuda/include/cuda_runtime.h", - "cuda/include/cuda_runtime_api.h", - "cuda/include/cuda_surface_types.h", - "cuda/include/cuda_texture_types.h", - "cuda/include/cuda_vdpau_interop.h", - "cuda/include/cudalibxt.h", - "cuda/include/cudart_platform.h", - "cuda/include/cufft.h", - "cuda/include/cufftXt.h", - "cuda/include/cufftw.h", - "cuda/include/curand.h", - "cuda/include/curand_discrete.h", - "cuda/include/curand_discrete2.h", - "cuda/include/curand_globals.h", - "cuda/include/curand_kernel.h", - "cuda/include/curand_lognormal.h", - "cuda/include/curand_mrg32k3a.h", - "cuda/include/curand_mtgp32.h", - "cuda/include/curand_mtgp32_host.h", - "cuda/include/curand_mtgp32_kernel.h", - "cuda/include/curand_mtgp32dc_p_11213.h", - "cuda/include/curand_normal.h", - "cuda/include/curand_normal_static.h", - "cuda/include/curand_philox4x32_x.h", - "cuda/include/curand_poisson.h", - "cuda/include/curand_precalc.h", - "cuda/include/curand_uniform.h", - "cuda/include/cusolverDn.h", - "cuda/include/cusolverRf.h", - "cuda/include/cusolverSp.h", - "cuda/include/cusolverSp_LOWLEVEL_PREVIEW.h", - "cuda/include/cusolver_common.h", - "cuda/include/cusparse.h", - "cuda/include/cusparse_v2.h", - "cuda/include/device_atomic_functions.h", - "cuda/include/device_atomic_functions.hpp", - "cuda/include/device_double_functions.h", - "cuda/include/device_functions.h", - "cuda/include/device_launch_parameters.h", - "cuda/include/device_types.h", - "cuda/include/driver_functions.h", - "cuda/include/driver_types.h", - "cuda/include/fatBinaryCtl.h", - "cuda/include/fatbinary.h", - "cuda/include/host_config.h", - "cuda/include/host_defines.h", - "cuda/include/library_types.h", - "cuda/include/math_constants.h", - "cuda/include/math_functions.h", - "cuda/include/mma.h", - "cuda/include/npp.h", - "cuda/include/nppcore.h", - "cuda/include/nppdefs.h", - "cuda/include/nppi.h", - "cuda/include/nppi_arithmetic_and_logical_operations.h", - "cuda/include/nppi_color_conversion.h", - "cuda/include/nppi_compression_functions.h", - "cuda/include/nppi_computer_vision.h", - "cuda/include/nppi_data_exchange_and_initialization.h", - "cuda/include/nppi_filtering_functions.h", - "cuda/include/nppi_geometry_transforms.h", - "cuda/include/nppi_linear_transforms.h", - "cuda/include/nppi_morphological_operations.h", - 
"cuda/include/nppi_statistics_functions.h", - "cuda/include/nppi_support_functions.h", - "cuda/include/nppi_threshold_and_compare_operations.h", - "cuda/include/npps.h", - "cuda/include/npps_arithmetic_and_logical_operations.h", - "cuda/include/npps_conversion_functions.h", - "cuda/include/npps_filtering_functions.h", - "cuda/include/npps_initialization.h", - "cuda/include/npps_statistics_functions.h", - "cuda/include/npps_support_functions.h", - "cuda/include/nppversion.h", - "cuda/include/nvToolsExt.h", - "cuda/include/nvToolsExtCuda.h", - "cuda/include/nvToolsExtCudaRt.h", - "cuda/include/nvToolsExtMeta.h", - "cuda/include/nvToolsExtSync.h", - "cuda/include/nvblas.h", - "cuda/include/nvfunctional", - "cuda/include/nvgraph.h", - "cuda/include/nvjpeg.h", - "cuda/include/nvml.h", - "cuda/include/nvrtc.h", - "cuda/include/nvtx3/nvToolsExt.h", - "cuda/include/nvtx3/nvToolsExtCuda.h", - "cuda/include/nvtx3/nvToolsExtCudaRt.h", - "cuda/include/nvtx3/nvToolsExtOpenCL.h", - "cuda/include/nvtx3/nvToolsExtSync.h", - "cuda/include/nvtx3/nvtxDetail/nvtxImpl.h", - "cuda/include/nvtx3/nvtxDetail/nvtxImplCore.h", - "cuda/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h", - "cuda/include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h", - "cuda/include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h", - "cuda/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h", - "cuda/include/nvtx3/nvtxDetail/nvtxInit.h", - "cuda/include/nvtx3/nvtxDetail/nvtxInitDecls.h", - "cuda/include/nvtx3/nvtxDetail/nvtxInitDefs.h", - "cuda/include/nvtx3/nvtxDetail/nvtxLinkOnce.h", - "cuda/include/nvtx3/nvtxDetail/nvtxTypes.h", - "cuda/include/sm_20_atomic_functions.h", - "cuda/include/sm_20_atomic_functions.hpp", - "cuda/include/sm_20_intrinsics.h", - "cuda/include/sm_20_intrinsics.hpp", - "cuda/include/sm_30_intrinsics.h", - "cuda/include/sm_30_intrinsics.hpp", - "cuda/include/sm_32_atomic_functions.h", - "cuda/include/sm_32_atomic_functions.hpp", - "cuda/include/sm_32_intrinsics.h", - "cuda/include/sm_32_intrinsics.hpp", - "cuda/include/sm_35_atomic_functions.h", - "cuda/include/sm_35_intrinsics.h", - "cuda/include/sm_60_atomic_functions.h", - "cuda/include/sm_60_atomic_functions.hpp", - "cuda/include/sm_61_intrinsics.h", - "cuda/include/sm_61_intrinsics.hpp", - "cuda/include/sobol_direction_vectors.h", - "cuda/include/surface_functions.h", - "cuda/include/surface_functions.hpp", - "cuda/include/surface_indirect_functions.h", - "cuda/include/surface_indirect_functions.hpp", - "cuda/include/surface_types.h", - "cuda/include/texture_fetch_functions.h", - "cuda/include/texture_fetch_functions.hpp", - "cuda/include/texture_indirect_functions.h", - "cuda/include/texture_indirect_functions.hpp", - "cuda/include/texture_types.h", - "cuda/include/thrust/adjacent_difference.h", - "cuda/include/thrust/advance.h", - "cuda/include/thrust/binary_search.h", - "cuda/include/thrust/complex.h", - "cuda/include/thrust/copy.h", - "cuda/include/thrust/count.h", - "cuda/include/thrust/detail/adjacent_difference.inl", - "cuda/include/thrust/detail/advance.inl", - "cuda/include/thrust/detail/alignment.h", - "cuda/include/thrust/detail/allocator/allocator_traits.h", - "cuda/include/thrust/detail/allocator/allocator_traits.inl", - "cuda/include/thrust/detail/allocator/copy_construct_range.h", - "cuda/include/thrust/detail/allocator/copy_construct_range.inl", - "cuda/include/thrust/detail/allocator/default_construct_range.h", - "cuda/include/thrust/detail/allocator/default_construct_range.inl", - "cuda/include/thrust/detail/allocator/destroy_range.h", - 
"cuda/include/thrust/detail/allocator/destroy_range.inl", - "cuda/include/thrust/detail/allocator/fill_construct_range.h", - "cuda/include/thrust/detail/allocator/fill_construct_range.inl", - "cuda/include/thrust/detail/allocator/malloc_allocator.h", - "cuda/include/thrust/detail/allocator/malloc_allocator.inl", - "cuda/include/thrust/detail/allocator/no_throw_allocator.h", - "cuda/include/thrust/detail/allocator/tagged_allocator.h", - "cuda/include/thrust/detail/allocator/tagged_allocator.inl", - "cuda/include/thrust/detail/allocator/temporary_allocator.h", - "cuda/include/thrust/detail/allocator/temporary_allocator.inl", - "cuda/include/thrust/detail/binary_search.inl", - "cuda/include/thrust/detail/complex/arithmetic.h", - "cuda/include/thrust/detail/complex/c99math.h", - "cuda/include/thrust/detail/complex/catrig.h", - "cuda/include/thrust/detail/complex/catrigf.h", - "cuda/include/thrust/detail/complex/ccosh.h", - "cuda/include/thrust/detail/complex/ccoshf.h", - "cuda/include/thrust/detail/complex/cexp.h", - "cuda/include/thrust/detail/complex/cexpf.h", - "cuda/include/thrust/detail/complex/clog.h", - "cuda/include/thrust/detail/complex/clogf.h", - "cuda/include/thrust/detail/complex/complex.inl", - "cuda/include/thrust/detail/complex/cpow.h", - "cuda/include/thrust/detail/complex/cproj.h", - "cuda/include/thrust/detail/complex/csinh.h", - "cuda/include/thrust/detail/complex/csinhf.h", - "cuda/include/thrust/detail/complex/csqrt.h", - "cuda/include/thrust/detail/complex/csqrtf.h", - "cuda/include/thrust/detail/complex/ctanh.h", - "cuda/include/thrust/detail/complex/ctanhf.h", - "cuda/include/thrust/detail/complex/math_private.h", - "cuda/include/thrust/detail/complex/stream.h", - "cuda/include/thrust/detail/config.h", - "cuda/include/thrust/detail/config/compiler.h", - "cuda/include/thrust/detail/config/compiler_fence.h", - "cuda/include/thrust/detail/config/config.h", - "cuda/include/thrust/detail/config/debug.h", - "cuda/include/thrust/detail/config/device_system.h", - "cuda/include/thrust/detail/config/exec_check_disable.h", - "cuda/include/thrust/detail/config/forceinline.h", - "cuda/include/thrust/detail/config/global_workarounds.h", - "cuda/include/thrust/detail/config/host_device.h", - "cuda/include/thrust/detail/config/host_system.h", - "cuda/include/thrust/detail/config/simple_defines.h", - "cuda/include/thrust/detail/contiguous_storage.h", - "cuda/include/thrust/detail/contiguous_storage.inl", - "cuda/include/thrust/detail/copy.h", - "cuda/include/thrust/detail/copy.inl", - "cuda/include/thrust/detail/copy_if.h", - "cuda/include/thrust/detail/copy_if.inl", - "cuda/include/thrust/detail/count.inl", - "cuda/include/thrust/detail/cstdint.h", - "cuda/include/thrust/detail/device_delete.inl", - "cuda/include/thrust/detail/device_free.inl", - "cuda/include/thrust/detail/device_malloc.inl", - "cuda/include/thrust/detail/device_new.inl", - "cuda/include/thrust/detail/device_ptr.inl", - "cuda/include/thrust/detail/device_reference.inl", - "cuda/include/thrust/detail/device_vector.inl", - "cuda/include/thrust/detail/dispatch/is_trivial_copy.h", - "cuda/include/thrust/detail/distance.inl", - "cuda/include/thrust/detail/equal.inl", - "cuda/include/thrust/detail/execute_with_allocator.h", - "cuda/include/thrust/detail/execution_policy.h", - "cuda/include/thrust/detail/extrema.inl", - "cuda/include/thrust/detail/fill.inl", - "cuda/include/thrust/detail/find.inl", - "cuda/include/thrust/detail/for_each.inl", - "cuda/include/thrust/detail/function.h", - 
"cuda/include/thrust/detail/functional.inl", - "cuda/include/thrust/detail/functional/actor.h", - "cuda/include/thrust/detail/functional/actor.inl", - "cuda/include/thrust/detail/functional/argument.h", - "cuda/include/thrust/detail/functional/composite.h", - "cuda/include/thrust/detail/functional/operators.h", - "cuda/include/thrust/detail/functional/operators/arithmetic_operators.h", - "cuda/include/thrust/detail/functional/operators/assignment_operator.h", - "cuda/include/thrust/detail/functional/operators/bitwise_operators.h", - "cuda/include/thrust/detail/functional/operators/compound_assignment_operators.h", - "cuda/include/thrust/detail/functional/operators/logical_operators.h", - "cuda/include/thrust/detail/functional/operators/operator_adaptors.h", - "cuda/include/thrust/detail/functional/operators/relational_operators.h", - "cuda/include/thrust/detail/functional/placeholder.h", - "cuda/include/thrust/detail/functional/value.h", - "cuda/include/thrust/detail/gather.inl", - "cuda/include/thrust/detail/generate.inl", - "cuda/include/thrust/detail/get_iterator_value.h", - "cuda/include/thrust/detail/host_vector.inl", - "cuda/include/thrust/detail/inner_product.inl", - "cuda/include/thrust/detail/integer_math.h", - "cuda/include/thrust/detail/integer_traits.h", - "cuda/include/thrust/detail/internal_functional.h", - "cuda/include/thrust/detail/logical.inl", - "cuda/include/thrust/detail/malloc_and_free.h", - "cuda/include/thrust/detail/merge.inl", - "cuda/include/thrust/detail/minmax.h", - "cuda/include/thrust/detail/mismatch.inl", - "cuda/include/thrust/detail/mpl/math.h", - "cuda/include/thrust/detail/numeric_traits.h", - "cuda/include/thrust/detail/overlapped_copy.h", - "cuda/include/thrust/detail/pair.inl", - "cuda/include/thrust/detail/partition.inl", - "cuda/include/thrust/detail/pointer.h", - "cuda/include/thrust/detail/pointer.inl", - "cuda/include/thrust/detail/preprocessor.h", - "cuda/include/thrust/detail/range/head_flags.h", - "cuda/include/thrust/detail/range/tail_flags.h", - "cuda/include/thrust/detail/raw_pointer_cast.h", - "cuda/include/thrust/detail/raw_reference_cast.h", - "cuda/include/thrust/detail/reduce.inl", - "cuda/include/thrust/detail/reference.h", - "cuda/include/thrust/detail/reference.inl", - "cuda/include/thrust/detail/reference_forward_declaration.h", - "cuda/include/thrust/detail/remove.inl", - "cuda/include/thrust/detail/replace.inl", - "cuda/include/thrust/detail/reverse.inl", - "cuda/include/thrust/detail/scan.inl", - "cuda/include/thrust/detail/scatter.inl", - "cuda/include/thrust/detail/seq.h", - "cuda/include/thrust/detail/sequence.inl", - "cuda/include/thrust/detail/set_operations.inl", - "cuda/include/thrust/detail/sort.inl", - "cuda/include/thrust/detail/static_assert.h", - "cuda/include/thrust/detail/static_map.h", - "cuda/include/thrust/detail/swap.h", - "cuda/include/thrust/detail/swap.inl", - "cuda/include/thrust/detail/swap_ranges.inl", - "cuda/include/thrust/detail/tabulate.inl", - "cuda/include/thrust/detail/temporary_array.h", - "cuda/include/thrust/detail/temporary_array.inl", - "cuda/include/thrust/detail/temporary_buffer.h", - "cuda/include/thrust/detail/transform.inl", - "cuda/include/thrust/detail/transform_reduce.inl", - "cuda/include/thrust/detail/transform_scan.inl", - "cuda/include/thrust/detail/trivial_sequence.h", - "cuda/include/thrust/detail/tuple.inl", - "cuda/include/thrust/detail/tuple_meta_transform.h", - "cuda/include/thrust/detail/tuple_transform.h", - "cuda/include/thrust/detail/type_traits.h", - 
"cuda/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h", - "cuda/include/thrust/detail/type_traits/function_traits.h", - "cuda/include/thrust/detail/type_traits/has_member_function.h", - "cuda/include/thrust/detail/type_traits/has_nested_type.h", - "cuda/include/thrust/detail/type_traits/has_trivial_assign.h", - "cuda/include/thrust/detail/type_traits/is_call_possible.h", - "cuda/include/thrust/detail/type_traits/is_metafunction_defined.h", - "cuda/include/thrust/detail/type_traits/iterator/is_discard_iterator.h", - "cuda/include/thrust/detail/type_traits/iterator/is_output_iterator.h", - "cuda/include/thrust/detail/type_traits/minimum_type.h", - "cuda/include/thrust/detail/type_traits/pointer_traits.h", - "cuda/include/thrust/detail/type_traits/result_of_adaptable_function.h", - "cuda/include/thrust/detail/uninitialized_copy.inl", - "cuda/include/thrust/detail/uninitialized_fill.inl", - "cuda/include/thrust/detail/unique.inl", - "cuda/include/thrust/detail/use_default.h", - "cuda/include/thrust/detail/util/align.h", - "cuda/include/thrust/detail/util/blocking.h", - "cuda/include/thrust/detail/vector_base.h", - "cuda/include/thrust/detail/vector_base.inl", - "cuda/include/thrust/device_allocator.h", - "cuda/include/thrust/device_delete.h", - "cuda/include/thrust/device_free.h", - "cuda/include/thrust/device_malloc.h", - "cuda/include/thrust/device_malloc_allocator.h", - "cuda/include/thrust/device_new.h", - "cuda/include/thrust/device_new_allocator.h", - "cuda/include/thrust/device_ptr.h", - "cuda/include/thrust/device_reference.h", - "cuda/include/thrust/device_vector.h", - "cuda/include/thrust/distance.h", - "cuda/include/thrust/equal.h", - "cuda/include/thrust/execution_policy.h", - "cuda/include/thrust/extrema.h", - "cuda/include/thrust/fill.h", - "cuda/include/thrust/find.h", - "cuda/include/thrust/for_each.h", - "cuda/include/thrust/functional.h", - "cuda/include/thrust/gather.h", - "cuda/include/thrust/generate.h", - "cuda/include/thrust/host_vector.h", - "cuda/include/thrust/inner_product.h", - "cuda/include/thrust/iterator/constant_iterator.h", - "cuda/include/thrust/iterator/counting_iterator.h", - "cuda/include/thrust/iterator/detail/any_assign.h", - "cuda/include/thrust/iterator/detail/any_system_tag.h", - "cuda/include/thrust/iterator/detail/constant_iterator_base.h", - "cuda/include/thrust/iterator/detail/counting_iterator.inl", - "cuda/include/thrust/iterator/detail/device_system_tag.h", - "cuda/include/thrust/iterator/detail/discard_iterator_base.h", - "cuda/include/thrust/iterator/detail/distance_from_result.h", - "cuda/include/thrust/iterator/detail/host_system_tag.h", - "cuda/include/thrust/iterator/detail/is_iterator_category.h", - "cuda/include/thrust/iterator/detail/is_trivial_iterator.h", - "cuda/include/thrust/iterator/detail/iterator_adaptor_base.h", - "cuda/include/thrust/iterator/detail/iterator_category_to_system.h", - "cuda/include/thrust/iterator/detail/iterator_category_to_traversal.h", - "cuda/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h", - "cuda/include/thrust/iterator/detail/iterator_facade_category.h", - "cuda/include/thrust/iterator/detail/iterator_traits.inl", - "cuda/include/thrust/iterator/detail/iterator_traversal_tags.h", - "cuda/include/thrust/iterator/detail/join_iterator.h", - "cuda/include/thrust/iterator/detail/minimum_category.h", - "cuda/include/thrust/iterator/detail/minimum_system.h", - "cuda/include/thrust/iterator/detail/normal_iterator.h", - 
"cuda/include/thrust/iterator/detail/permutation_iterator_base.h", - "cuda/include/thrust/iterator/detail/retag.h", - "cuda/include/thrust/iterator/detail/reverse_iterator.inl", - "cuda/include/thrust/iterator/detail/reverse_iterator_base.h", - "cuda/include/thrust/iterator/detail/tagged_iterator.h", - "cuda/include/thrust/iterator/detail/transform_iterator.inl", - "cuda/include/thrust/iterator/detail/transform_output_iterator.inl", - "cuda/include/thrust/iterator/detail/tuple_of_iterator_references.h", - "cuda/include/thrust/iterator/detail/universal_categories.h", - "cuda/include/thrust/iterator/detail/zip_iterator.inl", - "cuda/include/thrust/iterator/detail/zip_iterator_base.h", - "cuda/include/thrust/iterator/discard_iterator.h", - "cuda/include/thrust/iterator/iterator_adaptor.h", - "cuda/include/thrust/iterator/iterator_categories.h", - "cuda/include/thrust/iterator/iterator_facade.h", - "cuda/include/thrust/iterator/iterator_traits.h", - "cuda/include/thrust/iterator/permutation_iterator.h", - "cuda/include/thrust/iterator/retag.h", - "cuda/include/thrust/iterator/reverse_iterator.h", - "cuda/include/thrust/iterator/transform_iterator.h", - "cuda/include/thrust/iterator/transform_output_iterator.h", - "cuda/include/thrust/iterator/zip_iterator.h", - "cuda/include/thrust/logical.h", - "cuda/include/thrust/memory.h", - "cuda/include/thrust/merge.h", - "cuda/include/thrust/mismatch.h", - "cuda/include/thrust/pair.h", - "cuda/include/thrust/partition.h", - "cuda/include/thrust/random.h", - "cuda/include/thrust/random/detail/discard_block_engine.inl", - "cuda/include/thrust/random/detail/linear_congruential_engine.inl", - "cuda/include/thrust/random/detail/linear_congruential_engine_discard.h", - "cuda/include/thrust/random/detail/linear_feedback_shift_engine.inl", - "cuda/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h", - "cuda/include/thrust/random/detail/mod.h", - "cuda/include/thrust/random/detail/normal_distribution.inl", - "cuda/include/thrust/random/detail/normal_distribution_base.h", - "cuda/include/thrust/random/detail/random_core_access.h", - "cuda/include/thrust/random/detail/subtract_with_carry_engine.inl", - "cuda/include/thrust/random/detail/uniform_int_distribution.inl", - "cuda/include/thrust/random/detail/uniform_real_distribution.inl", - "cuda/include/thrust/random/detail/xor_combine_engine.inl", - "cuda/include/thrust/random/detail/xor_combine_engine_max.h", - "cuda/include/thrust/random/discard_block_engine.h", - "cuda/include/thrust/random/linear_congruential_engine.h", - "cuda/include/thrust/random/linear_feedback_shift_engine.h", - "cuda/include/thrust/random/normal_distribution.h", - "cuda/include/thrust/random/subtract_with_carry_engine.h", - "cuda/include/thrust/random/uniform_int_distribution.h", - "cuda/include/thrust/random/uniform_real_distribution.h", - "cuda/include/thrust/random/xor_combine_engine.h", - "cuda/include/thrust/reduce.h", - "cuda/include/thrust/remove.h", - "cuda/include/thrust/replace.h", - "cuda/include/thrust/reverse.h", - "cuda/include/thrust/scan.h", - "cuda/include/thrust/scatter.h", - "cuda/include/thrust/sequence.h", - "cuda/include/thrust/set_operations.h", - "cuda/include/thrust/sort.h", - "cuda/include/thrust/swap.h", - "cuda/include/thrust/system/cpp/detail/adjacent_difference.h", - "cuda/include/thrust/system/cpp/detail/assign_value.h", - "cuda/include/thrust/system/cpp/detail/binary_search.h", - "cuda/include/thrust/system/cpp/detail/copy.h", - "cuda/include/thrust/system/cpp/detail/copy_if.h", - 
"cuda/include/thrust/system/cpp/detail/count.h", - "cuda/include/thrust/system/cpp/detail/equal.h", - "cuda/include/thrust/system/cpp/detail/execution_policy.h", - "cuda/include/thrust/system/cpp/detail/extrema.h", - "cuda/include/thrust/system/cpp/detail/fill.h", - "cuda/include/thrust/system/cpp/detail/find.h", - "cuda/include/thrust/system/cpp/detail/for_each.h", - "cuda/include/thrust/system/cpp/detail/gather.h", - "cuda/include/thrust/system/cpp/detail/generate.h", - "cuda/include/thrust/system/cpp/detail/get_value.h", - "cuda/include/thrust/system/cpp/detail/inner_product.h", - "cuda/include/thrust/system/cpp/detail/iter_swap.h", - "cuda/include/thrust/system/cpp/detail/logical.h", - "cuda/include/thrust/system/cpp/detail/malloc_and_free.h", - "cuda/include/thrust/system/cpp/detail/memory.inl", - "cuda/include/thrust/system/cpp/detail/merge.h", - "cuda/include/thrust/system/cpp/detail/mismatch.h", - "cuda/include/thrust/system/cpp/detail/par.h", - "cuda/include/thrust/system/cpp/detail/partition.h", - "cuda/include/thrust/system/cpp/detail/reduce.h", - "cuda/include/thrust/system/cpp/detail/reduce_by_key.h", - "cuda/include/thrust/system/cpp/detail/remove.h", - "cuda/include/thrust/system/cpp/detail/replace.h", - "cuda/include/thrust/system/cpp/detail/reverse.h", - "cuda/include/thrust/system/cpp/detail/scan.h", - "cuda/include/thrust/system/cpp/detail/scan_by_key.h", - "cuda/include/thrust/system/cpp/detail/scatter.h", - "cuda/include/thrust/system/cpp/detail/sequence.h", - "cuda/include/thrust/system/cpp/detail/set_operations.h", - "cuda/include/thrust/system/cpp/detail/sort.h", - "cuda/include/thrust/system/cpp/detail/swap_ranges.h", - "cuda/include/thrust/system/cpp/detail/tabulate.h", - "cuda/include/thrust/system/cpp/detail/temporary_buffer.h", - "cuda/include/thrust/system/cpp/detail/transform.h", - "cuda/include/thrust/system/cpp/detail/transform_reduce.h", - "cuda/include/thrust/system/cpp/detail/transform_scan.h", - "cuda/include/thrust/system/cpp/detail/uninitialized_copy.h", - "cuda/include/thrust/system/cpp/detail/uninitialized_fill.h", - "cuda/include/thrust/system/cpp/detail/unique.h", - "cuda/include/thrust/system/cpp/detail/unique_by_key.h", - "cuda/include/thrust/system/cpp/detail/vector.inl", - "cuda/include/thrust/system/cpp/execution_policy.h", - "cuda/include/thrust/system/cpp/memory.h", - "cuda/include/thrust/system/cpp/vector.h", - "cuda/include/thrust/system/cuda/config.h", - "cuda/include/thrust/system/cuda/detail/adjacent_difference.h", - "cuda/include/thrust/system/cuda/detail/assign_value.h", - "cuda/include/thrust/system/cuda/detail/binary_search.h", - "cuda/include/thrust/system/cuda/detail/copy.h", - "cuda/include/thrust/system/cuda/detail/copy_if.h", - "cuda/include/thrust/system/cuda/detail/core/agent_launcher.h", - "cuda/include/thrust/system/cuda/detail/core/alignment.h", - "cuda/include/thrust/system/cuda/detail/core/triple_chevron_launch.h", - "cuda/include/thrust/system/cuda/detail/core/util.h", - "cuda/include/thrust/system/cuda/detail/count.h", - "cuda/include/thrust/system/cuda/detail/cross_system.h", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh", - 
"cuda/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_load.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_scan.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_store.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh", - "cuda/include/thrust/system/cuda/detail/cub/cub.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_partition.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_scan.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_select.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh", - 
"cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh", - "cuda/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh", - "cuda/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh", - "cuda/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh", - "cuda/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh", - "cuda/include/thrust/system/cuda/detail/cub/host/mutex.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh", - "cuda/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh", - "cuda/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh", - "cuda/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh", - "cuda/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh", - "cuda/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh", - "cuda/include/thrust/system/cuda/detail/cub/util_allocator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/util_arch.cuh", - "cuda/include/thrust/system/cuda/detail/cub/util_debug.cuh", - "cuda/include/thrust/system/cuda/detail/cub/util_device.cuh", - "cuda/include/thrust/system/cuda/detail/cub/util_macro.cuh", - "cuda/include/thrust/system/cuda/detail/cub/util_namespace.cuh", - "cuda/include/thrust/system/cuda/detail/cub/util_ptx.cuh", - "cuda/include/thrust/system/cuda/detail/cub/util_type.cuh", - "cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh", - "cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh", - "cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh", - "cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh", - "cuda/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh", - "cuda/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh", - "cuda/include/thrust/system/cuda/detail/equal.h", - "cuda/include/thrust/system/cuda/detail/error.inl", - "cuda/include/thrust/system/cuda/detail/execution_policy.h", - "cuda/include/thrust/system/cuda/detail/extrema.h", - "cuda/include/thrust/system/cuda/detail/fill.h", - "cuda/include/thrust/system/cuda/detail/find.h", - "cuda/include/thrust/system/cuda/detail/for_each.h", - "cuda/include/thrust/system/cuda/detail/gather.h", - "cuda/include/thrust/system/cuda/detail/generate.h", - "cuda/include/thrust/system/cuda/detail/get_value.h", - "cuda/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h", - "cuda/include/thrust/system/cuda/detail/guarded_driver_types.h", - "cuda/include/thrust/system/cuda/detail/inner_product.h", 
- "cuda/include/thrust/system/cuda/detail/internal/copy_cross_system.h", - "cuda/include/thrust/system/cuda/detail/internal/copy_device_to_device.h", - "cuda/include/thrust/system/cuda/detail/iter_swap.h", - "cuda/include/thrust/system/cuda/detail/logical.h", - "cuda/include/thrust/system/cuda/detail/malloc_and_free.h", - "cuda/include/thrust/system/cuda/detail/memory.inl", - "cuda/include/thrust/system/cuda/detail/merge.h", - "cuda/include/thrust/system/cuda/detail/mismatch.h", - "cuda/include/thrust/system/cuda/detail/par.h", - "cuda/include/thrust/system/cuda/detail/par_to_seq.h", - "cuda/include/thrust/system/cuda/detail/parallel_for.h", - "cuda/include/thrust/system/cuda/detail/partition.h", - "cuda/include/thrust/system/cuda/detail/reduce.h", - "cuda/include/thrust/system/cuda/detail/reduce_by_key.h", - "cuda/include/thrust/system/cuda/detail/remove.h", - "cuda/include/thrust/system/cuda/detail/replace.h", - "cuda/include/thrust/system/cuda/detail/reverse.h", - "cuda/include/thrust/system/cuda/detail/scan.h", - "cuda/include/thrust/system/cuda/detail/scan_by_key.h", - "cuda/include/thrust/system/cuda/detail/scatter.h", - "cuda/include/thrust/system/cuda/detail/sequence.h", - "cuda/include/thrust/system/cuda/detail/set_operations.h", - "cuda/include/thrust/system/cuda/detail/sort.h", - "cuda/include/thrust/system/cuda/detail/swap_ranges.h", - "cuda/include/thrust/system/cuda/detail/tabulate.h", - "cuda/include/thrust/system/cuda/detail/temporary_buffer.h", - "cuda/include/thrust/system/cuda/detail/terminate.h", - "cuda/include/thrust/system/cuda/detail/transform.h", - "cuda/include/thrust/system/cuda/detail/transform_reduce.h", - "cuda/include/thrust/system/cuda/detail/transform_scan.h", - "cuda/include/thrust/system/cuda/detail/uninitialized_copy.h", - "cuda/include/thrust/system/cuda/detail/uninitialized_fill.h", - "cuda/include/thrust/system/cuda/detail/unique.h", - "cuda/include/thrust/system/cuda/detail/unique_by_key.h", - "cuda/include/thrust/system/cuda/detail/util.h", - "cuda/include/thrust/system/cuda/detail/vector.inl", - "cuda/include/thrust/system/cuda/error.h", - "cuda/include/thrust/system/cuda/execution_policy.h", - "cuda/include/thrust/system/cuda/experimental/pinned_allocator.h", - "cuda/include/thrust/system/cuda/memory.h", - "cuda/include/thrust/system/cuda/vector.h", - "cuda/include/thrust/system/detail/adl/adjacent_difference.h", - "cuda/include/thrust/system/detail/adl/assign_value.h", - "cuda/include/thrust/system/detail/adl/binary_search.h", - "cuda/include/thrust/system/detail/adl/copy.h", - "cuda/include/thrust/system/detail/adl/copy_if.h", - "cuda/include/thrust/system/detail/adl/count.h", - "cuda/include/thrust/system/detail/adl/equal.h", - "cuda/include/thrust/system/detail/adl/extrema.h", - "cuda/include/thrust/system/detail/adl/fill.h", - "cuda/include/thrust/system/detail/adl/find.h", - "cuda/include/thrust/system/detail/adl/for_each.h", - "cuda/include/thrust/system/detail/adl/gather.h", - "cuda/include/thrust/system/detail/adl/generate.h", - "cuda/include/thrust/system/detail/adl/get_value.h", - "cuda/include/thrust/system/detail/adl/inner_product.h", - "cuda/include/thrust/system/detail/adl/iter_swap.h", - "cuda/include/thrust/system/detail/adl/logical.h", - "cuda/include/thrust/system/detail/adl/malloc_and_free.h", - "cuda/include/thrust/system/detail/adl/merge.h", - "cuda/include/thrust/system/detail/adl/mismatch.h", - "cuda/include/thrust/system/detail/adl/partition.h", - "cuda/include/thrust/system/detail/adl/reduce.h", - 
"cuda/include/thrust/system/detail/adl/reduce_by_key.h", - "cuda/include/thrust/system/detail/adl/remove.h", - "cuda/include/thrust/system/detail/adl/replace.h", - "cuda/include/thrust/system/detail/adl/reverse.h", - "cuda/include/thrust/system/detail/adl/scan.h", - "cuda/include/thrust/system/detail/adl/scan_by_key.h", - "cuda/include/thrust/system/detail/adl/scatter.h", - "cuda/include/thrust/system/detail/adl/sequence.h", - "cuda/include/thrust/system/detail/adl/set_operations.h", - "cuda/include/thrust/system/detail/adl/sort.h", - "cuda/include/thrust/system/detail/adl/swap_ranges.h", - "cuda/include/thrust/system/detail/adl/tabulate.h", - "cuda/include/thrust/system/detail/adl/temporary_buffer.h", - "cuda/include/thrust/system/detail/adl/transform.h", - "cuda/include/thrust/system/detail/adl/transform_reduce.h", - "cuda/include/thrust/system/detail/adl/transform_scan.h", - "cuda/include/thrust/system/detail/adl/uninitialized_copy.h", - "cuda/include/thrust/system/detail/adl/uninitialized_fill.h", - "cuda/include/thrust/system/detail/adl/unique.h", - "cuda/include/thrust/system/detail/adl/unique_by_key.h", - "cuda/include/thrust/system/detail/bad_alloc.h", - "cuda/include/thrust/system/detail/errno.h", - "cuda/include/thrust/system/detail/error_category.inl", - "cuda/include/thrust/system/detail/error_code.inl", - "cuda/include/thrust/system/detail/error_condition.inl", - "cuda/include/thrust/system/detail/generic/adjacent_difference.h", - "cuda/include/thrust/system/detail/generic/adjacent_difference.inl", - "cuda/include/thrust/system/detail/generic/advance.h", - "cuda/include/thrust/system/detail/generic/advance.inl", - "cuda/include/thrust/system/detail/generic/binary_search.h", - "cuda/include/thrust/system/detail/generic/binary_search.inl", - "cuda/include/thrust/system/detail/generic/copy.h", - "cuda/include/thrust/system/detail/generic/copy.inl", - "cuda/include/thrust/system/detail/generic/copy_if.h", - "cuda/include/thrust/system/detail/generic/copy_if.inl", - "cuda/include/thrust/system/detail/generic/count.h", - "cuda/include/thrust/system/detail/generic/count.inl", - "cuda/include/thrust/system/detail/generic/distance.h", - "cuda/include/thrust/system/detail/generic/distance.inl", - "cuda/include/thrust/system/detail/generic/equal.h", - "cuda/include/thrust/system/detail/generic/equal.inl", - "cuda/include/thrust/system/detail/generic/extrema.h", - "cuda/include/thrust/system/detail/generic/extrema.inl", - "cuda/include/thrust/system/detail/generic/fill.h", - "cuda/include/thrust/system/detail/generic/find.h", - "cuda/include/thrust/system/detail/generic/find.inl", - "cuda/include/thrust/system/detail/generic/for_each.h", - "cuda/include/thrust/system/detail/generic/gather.h", - "cuda/include/thrust/system/detail/generic/gather.inl", - "cuda/include/thrust/system/detail/generic/generate.h", - "cuda/include/thrust/system/detail/generic/generate.inl", - "cuda/include/thrust/system/detail/generic/inner_product.h", - "cuda/include/thrust/system/detail/generic/inner_product.inl", - "cuda/include/thrust/system/detail/generic/logical.h", - "cuda/include/thrust/system/detail/generic/memory.h", - "cuda/include/thrust/system/detail/generic/memory.inl", - "cuda/include/thrust/system/detail/generic/merge.h", - "cuda/include/thrust/system/detail/generic/merge.inl", - "cuda/include/thrust/system/detail/generic/mismatch.h", - "cuda/include/thrust/system/detail/generic/mismatch.inl", - "cuda/include/thrust/system/detail/generic/partition.h", - 
"cuda/include/thrust/system/detail/generic/partition.inl", - "cuda/include/thrust/system/detail/generic/reduce.h", - "cuda/include/thrust/system/detail/generic/reduce.inl", - "cuda/include/thrust/system/detail/generic/reduce_by_key.h", - "cuda/include/thrust/system/detail/generic/reduce_by_key.inl", - "cuda/include/thrust/system/detail/generic/remove.h", - "cuda/include/thrust/system/detail/generic/remove.inl", - "cuda/include/thrust/system/detail/generic/replace.h", - "cuda/include/thrust/system/detail/generic/replace.inl", - "cuda/include/thrust/system/detail/generic/reverse.h", - "cuda/include/thrust/system/detail/generic/reverse.inl", - "cuda/include/thrust/system/detail/generic/scalar/binary_search.h", - "cuda/include/thrust/system/detail/generic/scalar/binary_search.inl", - "cuda/include/thrust/system/detail/generic/scan.h", - "cuda/include/thrust/system/detail/generic/scan.inl", - "cuda/include/thrust/system/detail/generic/scan_by_key.h", - "cuda/include/thrust/system/detail/generic/scan_by_key.inl", - "cuda/include/thrust/system/detail/generic/scatter.h", - "cuda/include/thrust/system/detail/generic/scatter.inl", - "cuda/include/thrust/system/detail/generic/select_system.h", - "cuda/include/thrust/system/detail/generic/sequence.h", - "cuda/include/thrust/system/detail/generic/sequence.inl", - "cuda/include/thrust/system/detail/generic/set_operations.h", - "cuda/include/thrust/system/detail/generic/set_operations.inl", - "cuda/include/thrust/system/detail/generic/sort.h", - "cuda/include/thrust/system/detail/generic/sort.inl", - "cuda/include/thrust/system/detail/generic/swap_ranges.h", - "cuda/include/thrust/system/detail/generic/swap_ranges.inl", - "cuda/include/thrust/system/detail/generic/tabulate.h", - "cuda/include/thrust/system/detail/generic/tabulate.inl", - "cuda/include/thrust/system/detail/generic/tag.h", - "cuda/include/thrust/system/detail/generic/temporary_buffer.h", - "cuda/include/thrust/system/detail/generic/temporary_buffer.inl", - "cuda/include/thrust/system/detail/generic/transform.h", - "cuda/include/thrust/system/detail/generic/transform.inl", - "cuda/include/thrust/system/detail/generic/transform_reduce.h", - "cuda/include/thrust/system/detail/generic/transform_reduce.inl", - "cuda/include/thrust/system/detail/generic/transform_scan.h", - "cuda/include/thrust/system/detail/generic/transform_scan.inl", - "cuda/include/thrust/system/detail/generic/type_traits.h", - "cuda/include/thrust/system/detail/generic/uninitialized_copy.h", - "cuda/include/thrust/system/detail/generic/uninitialized_copy.inl", - "cuda/include/thrust/system/detail/generic/uninitialized_fill.h", - "cuda/include/thrust/system/detail/generic/uninitialized_fill.inl", - "cuda/include/thrust/system/detail/generic/unique.h", - "cuda/include/thrust/system/detail/generic/unique.inl", - "cuda/include/thrust/system/detail/generic/unique_by_key.h", - "cuda/include/thrust/system/detail/generic/unique_by_key.inl", - "cuda/include/thrust/system/detail/internal/decompose.h", - "cuda/include/thrust/system/detail/sequential/adjacent_difference.h", - "cuda/include/thrust/system/detail/sequential/assign_value.h", - "cuda/include/thrust/system/detail/sequential/binary_search.h", - "cuda/include/thrust/system/detail/sequential/copy.h", - "cuda/include/thrust/system/detail/sequential/copy.inl", - "cuda/include/thrust/system/detail/sequential/copy_backward.h", - "cuda/include/thrust/system/detail/sequential/copy_if.h", - "cuda/include/thrust/system/detail/sequential/count.h", - 
"cuda/include/thrust/system/detail/sequential/equal.h", - "cuda/include/thrust/system/detail/sequential/execution_policy.h", - "cuda/include/thrust/system/detail/sequential/extrema.h", - "cuda/include/thrust/system/detail/sequential/fill.h", - "cuda/include/thrust/system/detail/sequential/find.h", - "cuda/include/thrust/system/detail/sequential/for_each.h", - "cuda/include/thrust/system/detail/sequential/gather.h", - "cuda/include/thrust/system/detail/sequential/general_copy.h", - "cuda/include/thrust/system/detail/sequential/generate.h", - "cuda/include/thrust/system/detail/sequential/get_value.h", - "cuda/include/thrust/system/detail/sequential/inner_product.h", - "cuda/include/thrust/system/detail/sequential/insertion_sort.h", - "cuda/include/thrust/system/detail/sequential/iter_swap.h", - "cuda/include/thrust/system/detail/sequential/logical.h", - "cuda/include/thrust/system/detail/sequential/malloc_and_free.h", - "cuda/include/thrust/system/detail/sequential/merge.h", - "cuda/include/thrust/system/detail/sequential/merge.inl", - "cuda/include/thrust/system/detail/sequential/mismatch.h", - "cuda/include/thrust/system/detail/sequential/partition.h", - "cuda/include/thrust/system/detail/sequential/reduce.h", - "cuda/include/thrust/system/detail/sequential/reduce_by_key.h", - "cuda/include/thrust/system/detail/sequential/remove.h", - "cuda/include/thrust/system/detail/sequential/replace.h", - "cuda/include/thrust/system/detail/sequential/reverse.h", - "cuda/include/thrust/system/detail/sequential/scan.h", - "cuda/include/thrust/system/detail/sequential/scan_by_key.h", - "cuda/include/thrust/system/detail/sequential/scatter.h", - "cuda/include/thrust/system/detail/sequential/sequence.h", - "cuda/include/thrust/system/detail/sequential/set_operations.h", - "cuda/include/thrust/system/detail/sequential/sort.h", - "cuda/include/thrust/system/detail/sequential/sort.inl", - "cuda/include/thrust/system/detail/sequential/stable_merge_sort.h", - "cuda/include/thrust/system/detail/sequential/stable_merge_sort.inl", - "cuda/include/thrust/system/detail/sequential/stable_primitive_sort.h", - "cuda/include/thrust/system/detail/sequential/stable_primitive_sort.inl", - "cuda/include/thrust/system/detail/sequential/stable_radix_sort.h", - "cuda/include/thrust/system/detail/sequential/stable_radix_sort.inl", - "cuda/include/thrust/system/detail/sequential/swap_ranges.h", - "cuda/include/thrust/system/detail/sequential/tabulate.h", - "cuda/include/thrust/system/detail/sequential/temporary_buffer.h", - "cuda/include/thrust/system/detail/sequential/transform.h", - "cuda/include/thrust/system/detail/sequential/transform_reduce.h", - "cuda/include/thrust/system/detail/sequential/transform_scan.h", - "cuda/include/thrust/system/detail/sequential/trivial_copy.h", - "cuda/include/thrust/system/detail/sequential/uninitialized_copy.h", - "cuda/include/thrust/system/detail/sequential/uninitialized_fill.h", - "cuda/include/thrust/system/detail/sequential/unique.h", - "cuda/include/thrust/system/detail/sequential/unique_by_key.h", - "cuda/include/thrust/system/detail/system_error.inl", - "cuda/include/thrust/system/error_code.h", - "cuda/include/thrust/system/omp/detail/adjacent_difference.h", - "cuda/include/thrust/system/omp/detail/assign_value.h", - "cuda/include/thrust/system/omp/detail/binary_search.h", - "cuda/include/thrust/system/omp/detail/copy.h", - "cuda/include/thrust/system/omp/detail/copy.inl", - "cuda/include/thrust/system/omp/detail/copy_if.h", - "cuda/include/thrust/system/omp/detail/copy_if.inl", - 
"cuda/include/thrust/system/omp/detail/count.h", - "cuda/include/thrust/system/omp/detail/default_decomposition.h", - "cuda/include/thrust/system/omp/detail/default_decomposition.inl", - "cuda/include/thrust/system/omp/detail/equal.h", - "cuda/include/thrust/system/omp/detail/execution_policy.h", - "cuda/include/thrust/system/omp/detail/extrema.h", - "cuda/include/thrust/system/omp/detail/fill.h", - "cuda/include/thrust/system/omp/detail/find.h", - "cuda/include/thrust/system/omp/detail/for_each.h", - "cuda/include/thrust/system/omp/detail/for_each.inl", - "cuda/include/thrust/system/omp/detail/gather.h", - "cuda/include/thrust/system/omp/detail/generate.h", - "cuda/include/thrust/system/omp/detail/get_value.h", - "cuda/include/thrust/system/omp/detail/inner_product.h", - "cuda/include/thrust/system/omp/detail/iter_swap.h", - "cuda/include/thrust/system/omp/detail/logical.h", - "cuda/include/thrust/system/omp/detail/malloc_and_free.h", - "cuda/include/thrust/system/omp/detail/memory.inl", - "cuda/include/thrust/system/omp/detail/merge.h", - "cuda/include/thrust/system/omp/detail/mismatch.h", - "cuda/include/thrust/system/omp/detail/par.h", - "cuda/include/thrust/system/omp/detail/partition.h", - "cuda/include/thrust/system/omp/detail/partition.inl", - "cuda/include/thrust/system/omp/detail/reduce.h", - "cuda/include/thrust/system/omp/detail/reduce.inl", - "cuda/include/thrust/system/omp/detail/reduce_by_key.h", - "cuda/include/thrust/system/omp/detail/reduce_by_key.inl", - "cuda/include/thrust/system/omp/detail/reduce_intervals.h", - "cuda/include/thrust/system/omp/detail/reduce_intervals.inl", - "cuda/include/thrust/system/omp/detail/remove.h", - "cuda/include/thrust/system/omp/detail/remove.inl", - "cuda/include/thrust/system/omp/detail/replace.h", - "cuda/include/thrust/system/omp/detail/reverse.h", - "cuda/include/thrust/system/omp/detail/scan.h", - "cuda/include/thrust/system/omp/detail/scan_by_key.h", - "cuda/include/thrust/system/omp/detail/scatter.h", - "cuda/include/thrust/system/omp/detail/sequence.h", - "cuda/include/thrust/system/omp/detail/set_operations.h", - "cuda/include/thrust/system/omp/detail/sort.h", - "cuda/include/thrust/system/omp/detail/sort.inl", - "cuda/include/thrust/system/omp/detail/swap_ranges.h", - "cuda/include/thrust/system/omp/detail/tabulate.h", - "cuda/include/thrust/system/omp/detail/temporary_buffer.h", - "cuda/include/thrust/system/omp/detail/transform.h", - "cuda/include/thrust/system/omp/detail/transform_reduce.h", - "cuda/include/thrust/system/omp/detail/transform_scan.h", - "cuda/include/thrust/system/omp/detail/uninitialized_copy.h", - "cuda/include/thrust/system/omp/detail/uninitialized_fill.h", - "cuda/include/thrust/system/omp/detail/unique.h", - "cuda/include/thrust/system/omp/detail/unique.inl", - "cuda/include/thrust/system/omp/detail/unique_by_key.h", - "cuda/include/thrust/system/omp/detail/unique_by_key.inl", - "cuda/include/thrust/system/omp/detail/vector.inl", - "cuda/include/thrust/system/omp/execution_policy.h", - "cuda/include/thrust/system/omp/memory.h", - "cuda/include/thrust/system/omp/vector.h", - "cuda/include/thrust/system/system_error.h", - "cuda/include/thrust/system/tbb/detail/adjacent_difference.h", - "cuda/include/thrust/system/tbb/detail/assign_value.h", - "cuda/include/thrust/system/tbb/detail/binary_search.h", - "cuda/include/thrust/system/tbb/detail/copy.h", - "cuda/include/thrust/system/tbb/detail/copy.inl", - "cuda/include/thrust/system/tbb/detail/copy_if.h", - "cuda/include/thrust/system/tbb/detail/copy_if.inl", - 
"cuda/include/thrust/system/tbb/detail/count.h", - "cuda/include/thrust/system/tbb/detail/equal.h", - "cuda/include/thrust/system/tbb/detail/execution_policy.h", - "cuda/include/thrust/system/tbb/detail/extrema.h", - "cuda/include/thrust/system/tbb/detail/fill.h", - "cuda/include/thrust/system/tbb/detail/find.h", - "cuda/include/thrust/system/tbb/detail/for_each.h", - "cuda/include/thrust/system/tbb/detail/for_each.inl", - "cuda/include/thrust/system/tbb/detail/gather.h", - "cuda/include/thrust/system/tbb/detail/generate.h", - "cuda/include/thrust/system/tbb/detail/get_value.h", - "cuda/include/thrust/system/tbb/detail/inner_product.h", - "cuda/include/thrust/system/tbb/detail/iter_swap.h", - "cuda/include/thrust/system/tbb/detail/logical.h", - "cuda/include/thrust/system/tbb/detail/malloc_and_free.h", - "cuda/include/thrust/system/tbb/detail/memory.inl", - "cuda/include/thrust/system/tbb/detail/merge.h", - "cuda/include/thrust/system/tbb/detail/merge.inl", - "cuda/include/thrust/system/tbb/detail/mismatch.h", - "cuda/include/thrust/system/tbb/detail/par.h", - "cuda/include/thrust/system/tbb/detail/partition.h", - "cuda/include/thrust/system/tbb/detail/partition.inl", - "cuda/include/thrust/system/tbb/detail/reduce.h", - "cuda/include/thrust/system/tbb/detail/reduce.inl", - "cuda/include/thrust/system/tbb/detail/reduce_by_key.h", - "cuda/include/thrust/system/tbb/detail/reduce_by_key.inl", - "cuda/include/thrust/system/tbb/detail/reduce_intervals.h", - "cuda/include/thrust/system/tbb/detail/remove.h", - "cuda/include/thrust/system/tbb/detail/remove.inl", - "cuda/include/thrust/system/tbb/detail/replace.h", - "cuda/include/thrust/system/tbb/detail/reverse.h", - "cuda/include/thrust/system/tbb/detail/scan.h", - "cuda/include/thrust/system/tbb/detail/scan.inl", - "cuda/include/thrust/system/tbb/detail/scan_by_key.h", - "cuda/include/thrust/system/tbb/detail/scatter.h", - "cuda/include/thrust/system/tbb/detail/sequence.h", - "cuda/include/thrust/system/tbb/detail/set_operations.h", - "cuda/include/thrust/system/tbb/detail/sort.h", - "cuda/include/thrust/system/tbb/detail/sort.inl", - "cuda/include/thrust/system/tbb/detail/swap_ranges.h", - "cuda/include/thrust/system/tbb/detail/tabulate.h", - "cuda/include/thrust/system/tbb/detail/temporary_buffer.h", - "cuda/include/thrust/system/tbb/detail/transform.h", - "cuda/include/thrust/system/tbb/detail/transform_reduce.h", - "cuda/include/thrust/system/tbb/detail/transform_scan.h", - "cuda/include/thrust/system/tbb/detail/uninitialized_copy.h", - "cuda/include/thrust/system/tbb/detail/uninitialized_fill.h", - "cuda/include/thrust/system/tbb/detail/unique.h", - "cuda/include/thrust/system/tbb/detail/unique.inl", - "cuda/include/thrust/system/tbb/detail/unique_by_key.h", - "cuda/include/thrust/system/tbb/detail/unique_by_key.inl", - "cuda/include/thrust/system/tbb/detail/vector.inl", - "cuda/include/thrust/system/tbb/execution_policy.h", - "cuda/include/thrust/system/tbb/memory.h", - "cuda/include/thrust/system/tbb/vector.h", - "cuda/include/thrust/system_error.h", - "cuda/include/thrust/tabulate.h", - "cuda/include/thrust/transform.h", - "cuda/include/thrust/transform_reduce.h", - "cuda/include/thrust/transform_scan.h", - "cuda/include/thrust/tuple.h", - "cuda/include/thrust/uninitialized_copy.h", - "cuda/include/thrust/uninitialized_fill.h", - "cuda/include/thrust/unique.h", - "cuda/include/thrust/version.h", - "cuda/include/vector_functions.h", - "cuda/include/vector_functions.hpp", - "cuda/include/vector_types.h", - ], - cmd = """cp -rLf 
"/usr/local/cuda-10.0/include/." "$(@D)/cuda/include/" """, -) - -genrule( - name = "cuda-nvvm", - outs = [ - "cuda/nvvm/libdevice/libdevice.10.bc", - ], - cmd = """cp -rLf "/usr/local/cuda-10.0/nvvm/libdevice/." "$(@D)/" """, -) - -genrule( - name = "cuda-extras", - outs = [ - "cuda/extras/CUPTI/include/GL/gl.h", - "cuda/extras/CUPTI/include/GL/glew.h", - "cuda/extras/CUPTI/include/GL/glext.h", - "cuda/extras/CUPTI/include/GL/glu.h", - "cuda/extras/CUPTI/include/GL/glut.h", - "cuda/extras/CUPTI/include/GL/glx.h", - "cuda/extras/CUPTI/include/GL/glxext.h", - "cuda/extras/CUPTI/include/GL/wglew.h", - "cuda/extras/CUPTI/include/GL/wglext.h", - "cuda/extras/CUPTI/include/cuda_stdint.h", - "cuda/extras/CUPTI/include/cupti.h", - "cuda/extras/CUPTI/include/cupti_activity.h", - "cuda/extras/CUPTI/include/cupti_callbacks.h", - "cuda/extras/CUPTI/include/cupti_driver_cbid.h", - "cuda/extras/CUPTI/include/cupti_events.h", - "cuda/extras/CUPTI/include/cupti_metrics.h", - "cuda/extras/CUPTI/include/cupti_nvtx_cbid.h", - "cuda/extras/CUPTI/include/cupti_result.h", - "cuda/extras/CUPTI/include/cupti_runtime_cbid.h", - "cuda/extras/CUPTI/include/cupti_version.h", - "cuda/extras/CUPTI/include/generated_cudaGL_meta.h", - "cuda/extras/CUPTI/include/generated_cudaVDPAU_meta.h", - "cuda/extras/CUPTI/include/generated_cuda_gl_interop_meta.h", - "cuda/extras/CUPTI/include/generated_cuda_meta.h", - "cuda/extras/CUPTI/include/generated_cuda_runtime_api_meta.h", - "cuda/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h", - "cuda/extras/CUPTI/include/generated_nvtx_meta.h", - "cuda/extras/CUPTI/include/openacc/cupti_openacc.h", - "cuda/extras/CUPTI/include/openmp/cupti_openmp.h", - "cuda/extras/CUPTI/include/openmp/ompt.h", - ], - cmd = """cp -rLf "/usr/local/cuda-10.0/extras/CUPTI/include/." 
"$(@D)/cuda/extras/CUPTI/include/" """, -) - -genrule( - name = "cublas-include", - outs = [ - "cublas/include/cublas.h", - "cublas/include/cublas_v2.h", - "cublas/include/cublas_api.h", - ], - cmd = """cp -f "/usr/local/cuda-10.0/include/cublas.h" "$(location cublas/include/cublas.h)" && \ -cp -f "/usr/local/cuda-10.0/include/cublas_v2.h" "$(location cublas/include/cublas_v2.h)" && \ -cp -f "/usr/local/cuda-10.0/include/cublas_api.h" "$(location cublas/include/cublas_api.h)" """, -) - -genrule( - name = "cuda-lib", - outs = [ - "cuda/lib/libcuda.so", - "cuda/lib/libcudart.so.10.0", - "cuda/lib/libcudart_static.a", - "cuda/lib/libcublas.so.10.0", - "cuda/lib/libcusolver.so.10.0", - "cuda/lib/libcurand.so.10.0", - "cuda/lib/libcufft.so.10.0", - "cuda/lib/libcudnn.so.7", - "cuda/lib/libcupti.so.10.0", - "cuda/lib/libcusparse.so.10.0", - ], - cmd = """cp -f "/usr/local/cuda-10.0/lib64/stubs/libcuda.so" "$(location cuda/lib/libcuda.so)" && \ -cp -f "/usr/local/cuda-10.0/lib64/libcudart.so.10.0" "$(location cuda/lib/libcudart.so.10.0)" && \ -cp -f "/usr/local/cuda-10.0/lib64/libcudart_static.a" "$(location cuda/lib/libcudart_static.a)" && \ -cp -f "/usr/local/cuda-10.0/lib64/libcublas.so.10.0" "$(location cuda/lib/libcublas.so.10.0)" && \ -cp -f "/usr/local/cuda-10.0/lib64/libcusolver.so.10.0" "$(location cuda/lib/libcusolver.so.10.0)" && \ -cp -f "/usr/local/cuda-10.0/lib64/libcurand.so.10.0" "$(location cuda/lib/libcurand.so.10.0)" && \ -cp -f "/usr/local/cuda-10.0/lib64/libcufft.so.10.0" "$(location cuda/lib/libcufft.so.10.0)" && \ -cp -f "/usr/lib/x86_64-linux-gnu/libcudnn.so.7" "$(location cuda/lib/libcudnn.so.7)" && \ -cp -f "/usr/local/cuda-10.0/extras/CUPTI/lib64/libcupti.so.10.0" "$(location cuda/lib/libcupti.so.10.0)" && \ -cp -f "/usr/local/cuda-10.0/lib64/libcusparse.so.10.0" "$(location cuda/lib/libcusparse.so.10.0)" """, -) - -genrule( - name = "cuda-bin", - outs = [ - "cuda/bin/bin2c", - "cuda/bin/crt/link.stub", - "cuda/bin/crt/prelink.stub", - "cuda/bin/cuda-gdb", - "cuda/bin/cuda-gdbserver", - "cuda/bin/cuda-memcheck", - "cuda/bin/cudafe++", - "cuda/bin/cuobjdump", - "cuda/bin/fatbinary", - "cuda/bin/gpu-library-advisor", - "cuda/bin/nvcc", - "cuda/bin/nvcc.profile", - "cuda/bin/nvdisasm", - "cuda/bin/nvlink", - "cuda/bin/nvprof", - "cuda/bin/nvprune", - "cuda/bin/ptxas", - ], - cmd = """cp -rLf "/usr/local/cuda-10.0/bin/." "$(@D)/cuda/bin/" """, -) - -genrule( - name = "cudnn-include", - outs = [ - "cudnn/include/cudnn.h", - ], - cmd = """cp -f "/usr/include/cudnn.h" "$(location cudnn/include/cudnn.h)" """, -) diff --git a/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/build_defs.bzl b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/build_defs.bzl deleted file mode 100755 index 72472e4c224..00000000000 --- a/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/build_defs.bzl +++ /dev/null @@ -1,64 +0,0 @@ -# Macros for building CUDA code. -def if_cuda(if_true, if_false = []): - """Shorthand for select()'ing on whether we're building with CUDA. - - Returns a select statement which evaluates to if_true if we're building - with CUDA enabled. Otherwise, the select statement evaluates to if_false. 
- - """ - return select({ - "@local_config_cuda//cuda:using_nvcc": if_true, - "@local_config_cuda//cuda:using_clang": if_true, - "//conditions:default": if_false, - }) - -def cuda_default_copts(): - """Default options for all CUDA compilations.""" - return if_cuda(["-x", "cuda", "-DGOOGLE_CUDA=1"] + []) - -def cuda_is_configured(): - """Returns true if CUDA was enabled during the configure process.""" - return True - -def if_cuda_is_configured(x): - """Tests if the CUDA was enabled during the configure process. - - Unlike if_cuda(), this does not require that we are building with - --config=cuda. Used to allow non-CUDA code to depend on CUDA libraries. - """ - if cuda_is_configured(): - return select({"//conditions:default": x}) - return select({"//conditions:default": []}) - -def cuda_header_library( - name, - hdrs, - include_prefix = None, - strip_include_prefix = None, - deps = [], - **kwargs): - """Generates a cc_library containing both virtual and system include paths. - - Generates both a header-only target with virtual includes plus the full - target without virtual includes. This works around the fact that bazel can't - mix 'includes' and 'include_prefix' in the same target.""" - - native.cc_library( - name = name + "_virtual", - hdrs = hdrs, - include_prefix = include_prefix, - strip_include_prefix = strip_include_prefix, - deps = deps, - visibility = ["//visibility:private"], - ) - - native.cc_library( - name = name, - textual_hdrs = hdrs, - deps = deps + [":%s_virtual" % name], - **kwargs - ) - -def cuda_library(copts = [], **kwargs): - """Wrapper over cc_library which adds default CUDA options.""" - native.cc_library(copts = cuda_default_copts() + copts, **kwargs) diff --git a/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/cuda/cuda_config.h b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/cuda/cuda_config.h deleted file mode 100644 index 72a7cf77346..00000000000 --- a/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/cuda/cuda_config.h +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef CUDA_CUDA_CONFIG_H_ -#define CUDA_CUDA_CONFIG_H_ - -#define TF_CUDA_CAPABILITIES CudaVersion("3.0"), CudaVersion("6.0") - -#define TF_CUDA_VERSION "10.0" -#define TF_CUDA_LIB_VERSION "10.0" -#define TF_CUDNN_VERSION "7" - -#define TF_CUDA_TOOLKIT_PATH "/usr/local/cuda-10.0" - -#endif // CUDA_CUDA_CONFIG_H_ diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/BUILD deleted file mode 100755 index 399efccfdad..00000000000 --- a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/BUILD +++ /dev/null @@ -1,172 +0,0 @@ -# This file is expanded from a template by cuda_configure.bzl -# Update cuda_configure.bzl#verify_build_defines when adding new variables. 
- -load(":cc_toolchain_config.bzl", "cc_toolchain_config") - -licenses(["restricted"]) - -package(default_visibility = ["//visibility:public"]) - -toolchain( - name = "toolchain-linux-x86_64", - exec_compatible_with = [ - "@bazel_tools//platforms:linux", - "@bazel_tools//platforms:x86_64", - ], - target_compatible_with = [ - "@bazel_tools//platforms:linux", - "@bazel_tools//platforms:x86_64", - ], - toolchain = ":cc-compiler-local", - toolchain_type = "@bazel_tools//tools/cpp:toolchain_type", -) - -cc_toolchain_suite( - name = "toolchain", - toolchains = { - "local|compiler": ":cc-compiler-local", - "darwin|compiler": ":cc-compiler-darwin", - "x64_windows|msvc-cl": ":cc-compiler-windows", - "x64_windows": ":cc-compiler-windows", - "arm": ":cc-compiler-local", - "k8": ":cc-compiler-local", - "piii": ":cc-compiler-local", - "ppc": ":cc-compiler-local", - "darwin": ":cc-compiler-darwin", - }, -) - -cc_toolchain( - name = "cc-compiler-local", - all_files = ":crosstool_wrapper_driver_is_not_gcc", - compiler_files = ":empty", - dwp_files = ":empty", - linker_files = ":crosstool_wrapper_driver_is_not_gcc", - objcopy_files = ":empty", - strip_files = ":empty", - # To support linker flags that need to go to the start of command line - # we need the toolchain to support parameter files. Parameter files are - # last on the command line and contain all shared libraries to link, so all - # regular options will be left of them. - supports_param_files = 1, - toolchain_config = ":cc-compiler-local-config", - toolchain_identifier = "local_linux", -) - -cc_toolchain_config( - name = "cc-compiler-local-config", - builtin_include_directories = [ - "/usr/include/c++/4.8", - "/usr/include/x86_64-linux-gnu/c++/4.8", - "/usr/include/c++/4.8/backward", - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include", - "/usr/local/include", - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed", - "/usr/include/x86_64-linux-gnu", - "/usr/include", - "/usr/local/cuda-10.0/targets/x86_64-linux/include", - "/usr/local/cuda-10.0/include", - "/usr/local/cuda-10.0/extras/CUPTI/include", - "/usr/include", - ], - cpu = "local", - extra_no_canonical_prefixes_flags = ["-fno-canonical-system-headers"], - host_compiler_path = "clang/bin/crosstool_wrapper_driver_is_not_gcc", - host_compiler_prefix = "/usr/bin", - host_compiler_warnings = [], - host_unfiltered_compile_flags = [], - linker_bin_path = "/usr/bin", -) - -cc_toolchain( - name = "cc-compiler-darwin", - all_files = ":crosstool_wrapper_driver_is_not_gcc", - compiler_files = ":empty", - dwp_files = ":empty", - linker_files = ":crosstool_wrapper_driver_is_not_gcc", - objcopy_files = ":empty", - strip_files = ":empty", - supports_param_files = 0, - toolchain_config = ":cc-compiler-local-darwin", - toolchain_identifier = "local_darwin", -) - -cc_toolchain_config( - name = "cc-compiler-local-darwin", - builtin_include_directories = [ - "/usr/include/c++/4.8", - "/usr/include/x86_64-linux-gnu/c++/4.8", - "/usr/include/c++/4.8/backward", - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include", - "/usr/local/include", - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed", - "/usr/include/x86_64-linux-gnu", - "/usr/include", - "/usr/local/cuda-10.0/targets/x86_64-linux/include", - "/usr/local/cuda-10.0/include", - "/usr/local/cuda-10.0/extras/CUPTI/include", - "/usr/include", - ], - cpu = "darwin", - extra_no_canonical_prefixes_flags = ["-fno-canonical-system-headers"], - host_compiler_path = "clang/bin/crosstool_wrapper_driver_is_not_gcc", - host_compiler_prefix = "/usr/bin", - host_compiler_warnings = [], - 
host_unfiltered_compile_flags = [], - linker_bin_path = "/usr/bin", -) - -cc_toolchain( - name = "cc-compiler-windows", - all_files = ":windows_msvc_wrapper_files", - compiler_files = ":empty", - dwp_files = ":empty", - linker_files = ":windows_msvc_wrapper_files", - objcopy_files = ":empty", - strip_files = ":empty", - supports_param_files = 1, - toolchain_config = ":cc-compiler-windows-config", - toolchain_identifier = "local_windows", -) - -cc_toolchain_config( - name = "cc-compiler-windows-config", - builtin_include_directories = [ - "/usr/include/c++/4.8", - "/usr/include/x86_64-linux-gnu/c++/4.8", - "/usr/include/c++/4.8/backward", - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include", - "/usr/local/include", - "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed", - "/usr/include/x86_64-linux-gnu", - "/usr/include", - "/usr/local/cuda-10.0/targets/x86_64-linux/include", - "/usr/local/cuda-10.0/include", - "/usr/local/cuda-10.0/extras/CUPTI/include", - "/usr/include", - ], - cpu = "x64_windows", - msvc_cl_path = "msvc_not_used", - msvc_env_include = "msvc_not_used", - msvc_env_lib = "msvc_not_used", - msvc_env_path = "msvc_not_used", - msvc_env_tmp = "msvc_not_used", - msvc_lib_path = "msvc_not_used", - msvc_link_path = "msvc_not_used", - msvc_ml_path = "msvc_not_used", -) - -filegroup( - name = "empty", - srcs = [], -) - -filegroup( - name = "crosstool_wrapper_driver_is_not_gcc", - srcs = ["clang/bin/crosstool_wrapper_driver_is_not_gcc"], -) - -filegroup( - name = "windows_msvc_wrapper_files", - srcs = glob(["windows/msvc_*"]), -) diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/cc_toolchain_config.bzl b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/cc_toolchain_config.bzl deleted file mode 100755 index f7575bbe28e..00000000000 --- a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/cc_toolchain_config.bzl +++ /dev/null @@ -1,1485 +0,0 @@ -"""cc_toolchain_config rule for configuring CUDA toolchains on Linux, Mac, and Windows.""" - -load( - "@bazel_tools//tools/cpp:cc_toolchain_config_lib.bzl", - "action_config", - "env_entry", - "env_set", - "feature", - "feature_set", - "flag_group", - "flag_set", - "tool", - "tool_path", - "variable_with_value", -) -load( - "@bazel_tools//tools/build_defs/cc:action_names.bzl", - "ASSEMBLE_ACTION_NAME", - "CC_FLAGS_MAKE_VARIABLE_ACTION_NAME", - "CLIF_MATCH_ACTION_NAME", - "CPP_COMPILE_ACTION_NAME", - "CPP_HEADER_PARSING_ACTION_NAME", - "CPP_LINK_DYNAMIC_LIBRARY_ACTION_NAME", - "CPP_LINK_EXECUTABLE_ACTION_NAME", - "CPP_LINK_NODEPS_DYNAMIC_LIBRARY_ACTION_NAME", - "CPP_LINK_STATIC_LIBRARY_ACTION_NAME", - "CPP_MODULE_CODEGEN_ACTION_NAME", - "CPP_MODULE_COMPILE_ACTION_NAME", - "C_COMPILE_ACTION_NAME", - "LINKSTAMP_COMPILE_ACTION_NAME", - "LTO_BACKEND_ACTION_NAME", - "LTO_INDEXING_ACTION_NAME", - "OBJCPP_COMPILE_ACTION_NAME", - "OBJCPP_EXECUTABLE_ACTION_NAME", - "OBJC_ARCHIVE_ACTION_NAME", - "OBJC_COMPILE_ACTION_NAME", - "OBJC_EXECUTABLE_ACTION_NAME", - "OBJC_FULLY_LINK_ACTION_NAME", - "PREPROCESS_ASSEMBLE_ACTION_NAME", - "STRIP_ACTION_NAME", -) - -ACTION_NAMES = struct( - c_compile = C_COMPILE_ACTION_NAME, - cpp_compile = CPP_COMPILE_ACTION_NAME, - linkstamp_compile = LINKSTAMP_COMPILE_ACTION_NAME, - cc_flags_make_variable = CC_FLAGS_MAKE_VARIABLE_ACTION_NAME, - cpp_module_codegen = CPP_MODULE_CODEGEN_ACTION_NAME, - cpp_header_parsing = CPP_HEADER_PARSING_ACTION_NAME, - cpp_module_compile = CPP_MODULE_COMPILE_ACTION_NAME, - assemble = ASSEMBLE_ACTION_NAME, - preprocess_assemble = 
PREPROCESS_ASSEMBLE_ACTION_NAME, - lto_indexing = LTO_INDEXING_ACTION_NAME, - lto_backend = LTO_BACKEND_ACTION_NAME, - cpp_link_executable = CPP_LINK_EXECUTABLE_ACTION_NAME, - cpp_link_dynamic_library = CPP_LINK_DYNAMIC_LIBRARY_ACTION_NAME, - cpp_link_nodeps_dynamic_library = CPP_LINK_NODEPS_DYNAMIC_LIBRARY_ACTION_NAME, - cpp_link_static_library = CPP_LINK_STATIC_LIBRARY_ACTION_NAME, - strip = STRIP_ACTION_NAME, - objc_archive = OBJC_ARCHIVE_ACTION_NAME, - objc_compile = OBJC_COMPILE_ACTION_NAME, - objc_executable = OBJC_EXECUTABLE_ACTION_NAME, - objc_fully_link = OBJC_FULLY_LINK_ACTION_NAME, - objcpp_compile = OBJCPP_COMPILE_ACTION_NAME, - objcpp_executable = OBJCPP_EXECUTABLE_ACTION_NAME, - clif_match = CLIF_MATCH_ACTION_NAME, - objcopy_embed_data = "objcopy_embed_data", - ld_embed_data = "ld_embed_data", -) - -def _impl(ctx): - if (ctx.attr.cpu == "darwin"): - toolchain_identifier = "local_darwin" - elif (ctx.attr.cpu == "local"): - toolchain_identifier = "local_linux" - elif (ctx.attr.cpu == "x64_windows"): - toolchain_identifier = "local_windows" - else: - fail("Unreachable") - - host_system_name = "local" - - target_system_name = "local" - - if (ctx.attr.cpu == "darwin"): - target_cpu = "darwin" - elif (ctx.attr.cpu == "local"): - target_cpu = "local" - elif (ctx.attr.cpu == "x64_windows"): - target_cpu = "x64_windows" - else: - fail("Unreachable") - - if (ctx.attr.cpu == "local"): - target_libc = "local" - elif (ctx.attr.cpu == "darwin"): - target_libc = "macosx" - elif (ctx.attr.cpu == "x64_windows"): - target_libc = "msvcrt" - else: - fail("Unreachable") - - if (ctx.attr.cpu == "darwin" or - ctx.attr.cpu == "local"): - compiler = "compiler" - elif (ctx.attr.cpu == "x64_windows"): - compiler = "msvc-cl" - else: - fail("Unreachable") - - abi_version = "local" - - abi_libc_version = "local" - - cc_target_os = None - - builtin_sysroot = None - - all_link_actions = [ - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ] - - cpp_link_dynamic_library_action = action_config( - action_name = ACTION_NAMES.cpp_link_dynamic_library, - implies = [ - "nologo", - "shared_flag", - "linkstamps", - "output_execpath_flags", - "input_param_flags", - "user_link_flags", - "linker_subsystem_flag", - "linker_param_file", - "msvc_env", - "no_stripping", - "has_configured_linker_path", - "def_file", - ], - tools = [tool(path = ctx.attr.msvc_link_path)], - ) - - cpp_link_nodeps_dynamic_library_action = action_config( - action_name = ACTION_NAMES.cpp_link_nodeps_dynamic_library, - implies = [ - "nologo", - "shared_flag", - "linkstamps", - "output_execpath_flags", - "input_param_flags", - "user_link_flags", - "linker_subsystem_flag", - "linker_param_file", - "msvc_env", - "no_stripping", - "has_configured_linker_path", - "def_file", - ], - tools = [tool(path = ctx.attr.msvc_link_path)], - ) - - cpp_link_static_library_action = action_config( - action_name = ACTION_NAMES.cpp_link_static_library, - implies = [ - "nologo", - "archiver_flags", - "input_param_flags", - "linker_param_file", - "msvc_env", - ], - tools = [tool(path = ctx.attr.msvc_lib_path)], - ) - - assemble_action = action_config( - action_name = ACTION_NAMES.assemble, - implies = [ - "compiler_input_flags", - "compiler_output_flags", - "nologo", - "msvc_env", - "sysroot", - ], - tools = [tool(path = ctx.attr.msvc_ml_path)], - ) - - preprocess_assemble_action = action_config( - action_name = ACTION_NAMES.preprocess_assemble, - implies = [ - "compiler_input_flags", - 
"compiler_output_flags", - "nologo", - "msvc_env", - "sysroot", - ], - tools = [tool(path = ctx.attr.msvc_ml_path)], - ) - - c_compile_action = action_config( - action_name = ACTION_NAMES.c_compile, - implies = [ - "compiler_input_flags", - "compiler_output_flags", - "nologo", - "msvc_env", - "parse_showincludes", - "user_compile_flags", - "sysroot", - "unfiltered_compile_flags", - ], - tools = [tool(path = ctx.attr.msvc_cl_path)], - ) - - cpp_compile_action = action_config( - action_name = ACTION_NAMES.cpp_compile, - implies = [ - "compiler_input_flags", - "compiler_output_flags", - "nologo", - "msvc_env", - "parse_showincludes", - "user_compile_flags", - "sysroot", - "unfiltered_compile_flags", - ], - tools = [tool(path = ctx.attr.msvc_cl_path)], - ) - - cpp_link_executable_action = action_config( - action_name = ACTION_NAMES.cpp_link_executable, - implies = [ - "nologo", - "linkstamps", - "output_execpath_flags", - "input_param_flags", - "user_link_flags", - "linker_subsystem_flag", - "linker_param_file", - "msvc_env", - "no_stripping", - ], - tools = [tool(path = ctx.attr.msvc_link_path)], - ) - - if (ctx.attr.cpu == "darwin" or - ctx.attr.cpu == "local"): - action_configs = [] - elif (ctx.attr.cpu == "x64_windows"): - action_configs = [ - assemble_action, - preprocess_assemble_action, - c_compile_action, - cpp_compile_action, - cpp_link_executable_action, - cpp_link_dynamic_library_action, - cpp_link_nodeps_dynamic_library_action, - cpp_link_static_library_action, - ] - else: - fail("Unreachable") - - no_windows_export_all_symbols_feature = feature(name = "no_windows_export_all_symbols") - - pic_feature = feature( - name = "pic", - enabled = True, - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group(flags = ["-fPIC"], expand_if_available = "pic"), - flag_group( - flags = ["-fPIE"], - expand_if_not_available = "pic", - ), - ], - ), - ], - ) - - preprocessor_defines_feature = feature( - name = "preprocessor_defines", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ], - flag_groups = [ - flag_group( - flags = ["/D%{preprocessor_defines}"], - iterate_over = "preprocessor_defines", - ), - ], - ), - ], - ) - - generate_pdb_file_feature = feature( - name = "generate_pdb_file", - requires = [ - feature_set(features = ["dbg"]), - feature_set(features = ["fastbuild"]), - ], - ) - - linkstamps_feature = feature( - name = "linkstamps", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group( - flags = ["%{linkstamp_paths}"], - iterate_over = "linkstamp_paths", - expand_if_available = "linkstamp_paths", - ), - ], - ), - ], - ) - - unfiltered_compile_flags_feature = feature( - name = "unfiltered_compile_flags", - flag_sets = ([ - flag_set( - actions = [ - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ], - flag_groups = [ - flag_group( - flags = ctx.attr.host_unfiltered_compile_flags, - ), - ], - ), - ] if ctx.attr.host_unfiltered_compile_flags else []), - ) - - determinism_feature = feature( - name = "determinism", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group( - flags = [ - 
"-Wno-builtin-macro-redefined", - "-D__DATE__=\"redacted\"", - "-D__TIMESTAMP__=\"redacted\"", - "-D__TIME__=\"redacted\"", - ], - ), - ], - ), - ], - ) - - nologo_feature = feature( - name = "nologo", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ACTION_NAMES.cpp_link_static_library, - ], - flag_groups = [flag_group(flags = ["/nologo"])], - ), - ], - ) - - supports_pic_feature = feature(name = "supports_pic", enabled = True) - - output_execpath_flags_feature = feature( - name = "output_execpath_flags", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group( - flags = ["/OUT:%{output_execpath}"], - expand_if_available = "output_execpath", - ), - ], - ), - ], - ) - - default_link_flags_feature = feature( - name = "default_link_flags", - enabled = True, - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/MACHINE:X64"])], - ), - ], - ) - - if (ctx.attr.cpu == "local"): - hardening_feature = feature( - name = "hardening", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group( - flags = [ - "-U_FORTIFY_SOURCE", - "-D_FORTIFY_SOURCE=1", - "-fstack-protector", - ], - ), - ], - ), - flag_set( - actions = [ - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ], - flag_groups = [flag_group(flags = ["-Wl,-z,relro,-z,now"])], - ), - flag_set( - actions = [ACTION_NAMES.cpp_link_executable], - flag_groups = [flag_group(flags = ["-pie", "-Wl,-z,relro,-z,now"])], - ), - ], - ) - elif (ctx.attr.cpu == "darwin"): - hardening_feature = feature( - name = "hardening", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group( - flags = [ - "-U_FORTIFY_SOURCE", - "-D_FORTIFY_SOURCE=1", - "-fstack-protector", - ], - ), - ], - ), - flag_set( - actions = [ACTION_NAMES.cpp_link_executable], - flag_groups = [flag_group(flags = ["-pie"])], - ), - ], - ) - else: - hardening_feature = None - - supports_dynamic_linker_feature = feature(name = "supports_dynamic_linker", enabled = True) - - targets_windows_feature = feature( - name = "targets_windows", - enabled = True, - implies = ["copy_dynamic_libraries_to_binary"], - ) - - msvc_env_feature = feature( - name = "msvc_env", - env_sets = [ - env_set( - actions = [ - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ACTION_NAMES.cpp_link_static_library, - ], - env_entries = [ - env_entry(key = "PATH", value = ctx.attr.msvc_env_path), - env_entry( - key = "INCLUDE", - value = ctx.attr.msvc_env_include, - ), - env_entry(key = "LIB", value = ctx.attr.msvc_env_lib), - env_entry(key = "TMP", value = ctx.attr.msvc_env_tmp), - env_entry(key = "TEMP", value = ctx.attr.msvc_env_tmp), - ], - ), - ], - ) - - linker_subsystem_flag_feature = feature( - name = "linker_subsystem_flag", - flag_sets = [ - flag_set( - 
actions = all_link_actions, - flag_groups = [flag_group(flags = ["/SUBSYSTEM:CONSOLE"])], - ), - ], - ) - - dynamic_link_msvcrt_no_debug_feature = feature( - name = "dynamic_link_msvcrt_no_debug", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/MD"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/DEFAULTLIB:msvcrt.lib"])], - ), - ], - requires = [ - feature_set(features = ["fastbuild"]), - feature_set(features = ["opt"]), - ], - ) - - warnings_feature = feature( - name = "warnings", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group( - flags = ["-Wall"] + ctx.attr.host_compiler_warnings, - ), - ], - ), - ], - ) - - dynamic_link_msvcrt_debug_feature = feature( - name = "dynamic_link_msvcrt_debug", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/MDd"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/DEFAULTLIB:msvcrtd.lib"])], - ), - ], - requires = [feature_set(features = ["dbg"])], - ) - - compiler_output_flags_feature = feature( - name = "compiler_output_flags", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.assemble], - flag_groups = [ - flag_group( - flag_groups = [ - flag_group( - flags = ["/Fo%{output_file}", "/Zi"], - expand_if_not_available = "output_preprocess_file", - ), - ], - expand_if_available = "output_file", - expand_if_not_available = "output_assembly_file", - ), - ], - ), - flag_set( - actions = [ - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ], - flag_groups = [ - flag_group( - flag_groups = [ - flag_group( - flags = ["/Fo%{output_file}"], - expand_if_not_available = "output_preprocess_file", - ), - ], - expand_if_available = "output_file", - expand_if_not_available = "output_assembly_file", - ), - flag_group( - flag_groups = [ - flag_group( - flags = ["/Fa%{output_file}"], - expand_if_available = "output_assembly_file", - ), - ], - expand_if_available = "output_file", - ), - flag_group( - flag_groups = [ - flag_group( - flags = ["/P", "/Fi%{output_file}"], - expand_if_available = "output_preprocess_file", - ), - ], - expand_if_available = "output_file", - ), - ], - ), - ], - ) - - default_compile_flags_feature = feature( - name = "default_compile_flags", - enabled = True, - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.linkstamp_compile, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.lto_backend, - ACTION_NAMES.clif_match, - ], - flag_groups = [ - flag_group( - flags = [ - "/DCOMPILER_MSVC", - "/DNOMINMAX", - "/D_WIN32_WINNT=0x0600", - "/D_CRT_SECURE_NO_DEPRECATE", - "/D_CRT_SECURE_NO_WARNINGS", - "/D_SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS", - "/bigobj", - "/Zm500", - "/J", - "/Gy", - "/GF", - "/EHsc", - "/wd4351", - "/wd4291", - "/wd4250", - "/wd4996", - ], - ), - ], - ), - ], - ) - - static_link_msvcrt_debug_feature = feature( - name = "static_link_msvcrt_debug", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/MTd"])], - ), 
- flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/DEFAULTLIB:libcmtd.lib"])], - ), - ], - requires = [feature_set(features = ["dbg"])], - ) - - static_link_msvcrt_feature = feature(name = "static_link_msvcrt") - - if (ctx.attr.cpu == "darwin" or - ctx.attr.cpu == "local"): - dbg_feature = feature( - name = "dbg", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["-g"])], - ), - ], - implies = ["common"], - ) - elif (ctx.attr.cpu == "x64_windows"): - dbg_feature = feature( - name = "dbg", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/Od", "/Z7", "/DDEBUG"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/DEBUG:FULL", "/INCREMENTAL:NO"])], - ), - ], - implies = ["generate_pdb_file"], - ) - else: - dbg_feature = None - - undefined_dynamic_feature = feature( - name = "undefined-dynamic", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ACTION_NAMES.cpp_link_executable, - ], - flag_groups = [flag_group(flags = ["-undefined", "dynamic_lookup"])], - ), - ], - ) - - parse_showincludes_feature = feature( - name = "parse_showincludes", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_header_parsing, - ], - flag_groups = [flag_group(flags = ["/showIncludes"])], - ), - ], - ) - - linker_param_file_feature = feature( - name = "linker_param_file", - flag_sets = [ - flag_set( - actions = all_link_actions + - [ACTION_NAMES.cpp_link_static_library], - flag_groups = [ - flag_group( - flags = ["@%{linker_param_file}"], - expand_if_available = "linker_param_file", - ), - ], - ), - ], - ) - - static_link_msvcrt_no_debug_feature = feature( - name = "static_link_msvcrt_no_debug", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/MT"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/DEFAULTLIB:libcmt.lib"])], - ), - ], - requires = [ - feature_set(features = ["fastbuild"]), - feature_set(features = ["opt"]), - ], - ) - - supports_interface_shared_libraries_feature = feature( - name = "supports_interface_shared_libraries", - enabled = True, - ) - - disable_assertions_feature = feature( - name = "disable-assertions", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["-DNDEBUG"])], - ), - ], - ) - - if (ctx.attr.cpu == "x64_windows"): - fastbuild_feature = feature( - name = "fastbuild", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/Od", "/Z7", "/DDEBUG"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group(flags = ["/DEBUG:FASTLINK", "/INCREMENTAL:NO"]), - ], - ), - ], - implies = ["generate_pdb_file"], - ) - elif (ctx.attr.cpu == "darwin" or - ctx.attr.cpu == "local"): - fastbuild_feature = feature(name = "fastbuild", implies = ["common"]) - else: - fastbuild_feature = None - - user_compile_flags_feature = feature( - name = "user_compile_flags", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - 
ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ], - flag_groups = [ - flag_group( - flags = ["%{user_compile_flags}"], - iterate_over = "user_compile_flags", - expand_if_available = "user_compile_flags", - ), - ], - ), - ], - ) - - compiler_input_flags_feature = feature( - name = "compiler_input_flags", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ], - flag_groups = [ - flag_group( - flags = ["/c", "%{source_file}"], - expand_if_available = "source_file", - ), - ], - ), - ], - ) - - no_legacy_features_feature = feature(name = "no_legacy_features") - - archiver_flags_feature = feature( - name = "archiver_flags", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.cpp_link_static_library], - flag_groups = [ - flag_group( - flags = ["/OUT:%{output_execpath}"], - expand_if_available = "output_execpath", - ), - ], - ), - ], - ) - - redirector_feature = feature( - name = "redirector", - enabled = True, - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ], - flag_groups = [ - flag_group( - flags = [ - "-B", - "external/local_config_cuda/crosstool/windows/msvc_wrapper_for_nvcc.py", - ], - ), - ], - ), - ], - ) - - linker_bin_path_feature = feature( - name = "linker-bin-path", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["-B" + ctx.attr.linker_bin_path])], - ), - ], - ) - - if (ctx.attr.cpu == "local"): - opt_feature = feature( - name = "opt", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group( - flags = ["-g0", "-O2", "-ffunction-sections", "-fdata-sections"], - ), - ], - ), - flag_set( - actions = [ - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ACTION_NAMES.cpp_link_executable, - ], - flag_groups = [flag_group(flags = ["-Wl,--gc-sections"])], - ), - ], - implies = ["common", "disable-assertions"], - ) - elif (ctx.attr.cpu == "darwin"): - opt_feature = feature( - name = "opt", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group( - flags = ["-g0", "-O2", "-ffunction-sections", "-fdata-sections"], - ), - ], - ), - ], - implies = ["common", "disable-assertions"], - ) - elif (ctx.attr.cpu == "x64_windows"): - opt_feature = feature( - name = "opt", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/O2", "/DNDEBUG"])], - ), - ], - ) - else: - opt_feature = None - - include_paths_feature = feature( - name = "include_paths", - enabled = True, - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ], - flag_groups = [ - flag_group( - flags = ["/I%{quote_include_paths}"], - iterate_over = "quote_include_paths", - ), - flag_group( - flags = ["/I%{include_paths}"], - iterate_over = "include_paths", - ), - flag_group( 
- flags = ["/I%{system_include_paths}"], - iterate_over = "system_include_paths", - ), - ], - ), - ], - ) - - shared_flag_feature = feature( - name = "shared_flag", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ], - flag_groups = [flag_group(flags = ["/DLL"])], - ), - ], - ) - - windows_export_all_symbols_feature = feature(name = "windows_export_all_symbols") - - frame_pointer_feature = feature( - name = "frame-pointer", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["-fno-omit-frame-pointer"])], - ), - ], - ) - - build_id_feature = feature( - name = "build-id", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group( - flags = ["-Wl,--build-id=md5", "-Wl,--hash-style=gnu"], - ), - ], - ), - ], - ) - - sysroot_feature = feature( - name = "sysroot", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ], - flag_groups = [ - flag_group( - flags = ["--sysroot=%{sysroot}"], - iterate_over = "sysroot", - expand_if_available = "sysroot", - ), - ], - ), - ], - ) - - def_file_feature = feature( - name = "def_file", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group( - flags = ["/DEF:%{def_file_path}", "/ignore:4070"], - expand_if_available = "def_file_path", - ), - ], - ), - ], - ) - - if (ctx.attr.cpu == "darwin"): - stdlib_feature = feature( - name = "stdlib", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["-lc++"])], - ), - ], - ) - elif (ctx.attr.cpu == "local"): - stdlib_feature = feature( - name = "stdlib", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["-lstdc++"])], - ), - ], - ) - else: - stdlib_feature = None - - no_stripping_feature = feature(name = "no_stripping") - - alwayslink_feature = feature( - name = "alwayslink", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ACTION_NAMES.cpp_link_executable, - ], - flag_groups = [flag_group(flags = ["-Wl,-no-as-needed"])], - ), - ], - ) - - input_param_flags_feature = feature( - name = "input_param_flags", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ], - flag_groups = [ - flag_group( - flags = ["/IMPLIB:%{interface_library_output_path}"], - expand_if_available = "interface_library_output_path", - ), - ], - ), - flag_set( - actions = all_link_actions + - [ACTION_NAMES.cpp_link_static_library], - flag_groups = [ - flag_group( - iterate_over = "libraries_to_link", - flag_groups = [ - flag_group( - iterate_over = "libraries_to_link.object_files", - flag_groups = [flag_group(flags = ["%{libraries_to_link.object_files}"])], - expand_if_equal = variable_with_value( - name = "libraries_to_link.type", - value = "object_file_group", - ), - ), - flag_group( - flag_groups = [flag_group(flags = ["%{libraries_to_link.name}"])], - expand_if_equal = variable_with_value( - name = "libraries_to_link.type", - value = "object_file", 
- ), - ), - flag_group( - flag_groups = [flag_group(flags = ["%{libraries_to_link.name}"])], - expand_if_equal = variable_with_value( - name = "libraries_to_link.type", - value = "interface_library", - ), - ), - flag_group( - flag_groups = [ - flag_group( - flags = ["%{libraries_to_link.name}"], - expand_if_false = "libraries_to_link.is_whole_archive", - ), - flag_group( - flags = ["/WHOLEARCHIVE:%{libraries_to_link.name}"], - expand_if_true = "libraries_to_link.is_whole_archive", - ), - ], - expand_if_equal = variable_with_value( - name = "libraries_to_link.type", - value = "static_library", - ), - ), - ], - expand_if_available = "libraries_to_link", - ), - ], - ), - ], - ) - - if (ctx.attr.cpu == "local"): - no_canonical_prefixes_feature = feature( - name = "no-canonical-prefixes", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ], - flag_groups = [ - flag_group( - flags = [ - "-no-canonical-prefixes", - ] + ctx.attr.extra_no_canonical_prefixes_flags, - ), - ], - ), - ], - ) - elif (ctx.attr.cpu == "darwin"): - no_canonical_prefixes_feature = feature( - name = "no-canonical-prefixes", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ], - flag_groups = [flag_group(flags = ["-no-canonical-prefixes"])], - ), - ], - ) - else: - no_canonical_prefixes_feature = None - - has_configured_linker_path_feature = feature(name = "has_configured_linker_path") - - copy_dynamic_libraries_to_binary_feature = feature(name = "copy_dynamic_libraries_to_binary") - - user_link_flags_feature = feature( - name = "user_link_flags", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group( - flags = ["%{user_link_flags}"], - iterate_over = "user_link_flags", - expand_if_available = "user_link_flags", - ), - ], - ), - ], - ) - - cpp11_feature = feature( - name = "c++11", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["-std=c++11"])], - ), - ], - ) - - if (ctx.attr.cpu == "local"): - common_feature = feature( - name = "common", - implies = [ - "stdlib", - "c++11", - "determinism", - "alwayslink", - "hardening", - "warnings", - "frame-pointer", - "build-id", - "no-canonical-prefixes", - "linker-bin-path", - ], - ) - elif (ctx.attr.cpu == "darwin"): - common_feature = feature( - name = "common", - implies = [ - "stdlib", - "c++11", - "determinism", - "hardening", - "warnings", - "frame-pointer", - "no-canonical-prefixes", - "linker-bin-path", - "undefined-dynamic", - ], - ) - else: - common_feature = None - - if (ctx.attr.cpu == "local"): - features = [ - cpp11_feature, - stdlib_feature, - determinism_feature, - alwayslink_feature, - pic_feature, - hardening_feature, - warnings_feature, - frame_pointer_feature, - build_id_feature, - no_canonical_prefixes_feature, - disable_assertions_feature, - linker_bin_path_feature, - common_feature, - opt_feature, - fastbuild_feature, - dbg_feature, - supports_dynamic_linker_feature, - supports_pic_feature, - ] - elif (ctx.attr.cpu == "darwin"): - features = [ - cpp11_feature, - stdlib_feature, - determinism_feature, - pic_feature, - hardening_feature, - warnings_feature, - frame_pointer_feature, - no_canonical_prefixes_feature, - 
disable_assertions_feature, - linker_bin_path_feature, - undefined_dynamic_feature, - common_feature, - opt_feature, - fastbuild_feature, - dbg_feature, - supports_dynamic_linker_feature, - supports_pic_feature, - ] - elif (ctx.attr.cpu == "x64_windows"): - features = [ - no_legacy_features_feature, - redirector_feature, - nologo_feature, - has_configured_linker_path_feature, - no_stripping_feature, - targets_windows_feature, - copy_dynamic_libraries_to_binary_feature, - default_compile_flags_feature, - msvc_env_feature, - include_paths_feature, - preprocessor_defines_feature, - parse_showincludes_feature, - generate_pdb_file_feature, - shared_flag_feature, - linkstamps_feature, - output_execpath_flags_feature, - archiver_flags_feature, - input_param_flags_feature, - linker_subsystem_flag_feature, - user_link_flags_feature, - default_link_flags_feature, - linker_param_file_feature, - static_link_msvcrt_feature, - static_link_msvcrt_no_debug_feature, - dynamic_link_msvcrt_no_debug_feature, - static_link_msvcrt_debug_feature, - dynamic_link_msvcrt_debug_feature, - dbg_feature, - fastbuild_feature, - opt_feature, - user_compile_flags_feature, - sysroot_feature, - unfiltered_compile_flags_feature, - compiler_output_flags_feature, - compiler_input_flags_feature, - def_file_feature, - windows_export_all_symbols_feature, - no_windows_export_all_symbols_feature, - supports_dynamic_linker_feature, - supports_interface_shared_libraries_feature, - ] - else: - fail("Unreachable") - - cxx_builtin_include_directories = ctx.attr.builtin_include_directories - - if (ctx.attr.cpu == "x64_windows"): - tool_paths = [ - tool_path(name = "ar", path = ctx.attr.msvc_lib_path), - tool_path(name = "ml", path = ctx.attr.msvc_ml_path), - tool_path(name = "cpp", path = ctx.attr.msvc_cl_path), - tool_path(name = "gcc", path = ctx.attr.msvc_cl_path), - tool_path(name = "gcov", path = "wrapper/bin/msvc_nop.bat"), - tool_path(name = "ld", path = ctx.attr.msvc_link_path), - tool_path(name = "nm", path = "wrapper/bin/msvc_nop.bat"), - tool_path( - name = "objcopy", - path = "wrapper/bin/msvc_nop.bat", - ), - tool_path( - name = "objdump", - path = "wrapper/bin/msvc_nop.bat", - ), - tool_path( - name = "strip", - path = "wrapper/bin/msvc_nop.bat", - ), - ] - elif (ctx.attr.cpu == "local"): - tool_paths = [ - tool_path(name = "gcc", path = ctx.attr.host_compiler_path), - tool_path(name = "ar", path = ctx.attr.host_compiler_prefix + "/ar"), - tool_path(name = "compat-ld", path = ctx.attr.host_compiler_prefix + "/ld"), - tool_path(name = "cpp", path = ctx.attr.host_compiler_prefix + "/cpp"), - tool_path(name = "dwp", path = ctx.attr.host_compiler_prefix + "/dwp"), - tool_path(name = "gcov", path = ctx.attr.host_compiler_prefix + "/gcov"), - tool_path(name = "ld", path = ctx.attr.host_compiler_prefix + "/ld"), - tool_path(name = "nm", path = ctx.attr.host_compiler_prefix + "/nm"), - tool_path(name = "objcopy", path = ctx.attr.host_compiler_prefix + "/objcopy"), - tool_path(name = "objdump", path = ctx.attr.host_compiler_prefix + "/objdump"), - tool_path(name = "strip", path = ctx.attr.host_compiler_prefix + "/strip"), - ] - elif (ctx.attr.cpu == "darwin"): - tool_paths = [ - tool_path(name = "gcc", path = ctx.attr.host_compiler_path), - tool_path(name = "ar", path = ctx.attr.host_compiler_prefix + "/libtool"), - tool_path(name = "compat-ld", path = ctx.attr.host_compiler_prefix + "/ld"), - tool_path(name = "cpp", path = ctx.attr.host_compiler_prefix + "/cpp"), - tool_path(name = "dwp", path = ctx.attr.host_compiler_prefix + 
"/dwp"), - tool_path(name = "gcov", path = ctx.attr.host_compiler_prefix + "/gcov"), - tool_path(name = "ld", path = ctx.attr.host_compiler_prefix + "/ld"), - tool_path(name = "nm", path = ctx.attr.host_compiler_prefix + "/nm"), - tool_path(name = "objcopy", path = ctx.attr.host_compiler_prefix + "/objcopy"), - tool_path(name = "objdump", path = ctx.attr.host_compiler_prefix + "/objdump"), - tool_path(name = "strip", path = ctx.attr.host_compiler_prefix + "/strip"), - ] - else: - fail("Unreachable") - - out = ctx.actions.declare_file(ctx.label.name) - ctx.actions.write(out, "Fake executable") - return [ - cc_common.create_cc_toolchain_config_info( - ctx = ctx, - features = features, - action_configs = action_configs, - artifact_name_patterns = [], - cxx_builtin_include_directories = cxx_builtin_include_directories, - toolchain_identifier = toolchain_identifier, - host_system_name = host_system_name, - target_system_name = target_system_name, - target_cpu = target_cpu, - target_libc = target_libc, - compiler = compiler, - abi_version = abi_version, - abi_libc_version = abi_libc_version, - tool_paths = tool_paths, - make_variables = [], - builtin_sysroot = builtin_sysroot, - cc_target_os = cc_target_os, - ), - DefaultInfo( - executable = out, - ), - ] - -cc_toolchain_config = rule( - implementation = _impl, - attrs = { - "cpu": attr.string(mandatory = True, values = ["darwin", "local", "x64_windows"]), - "builtin_include_directories": attr.string_list(), - "extra_no_canonical_prefixes_flags": attr.string_list(), - "host_compiler_path": attr.string(), - "host_compiler_prefix": attr.string(), - "host_compiler_warnings": attr.string_list(), - "host_unfiltered_compile_flags": attr.string_list(), - "linker_bin_path": attr.string(), - "msvc_cl_path": attr.string(default = "msvc_not_used"), - "msvc_env_include": attr.string(default = "msvc_not_used"), - "msvc_env_lib": attr.string(default = "msvc_not_used"), - "msvc_env_path": attr.string(default = "msvc_not_used"), - "msvc_env_tmp": attr.string(default = "msvc_not_used"), - "msvc_lib_path": attr.string(default = "msvc_not_used"), - "msvc_link_path": attr.string(default = "msvc_not_used"), - "msvc_ml_path": attr.string(default = "msvc_not_used"), - }, - provides = [CcToolchainConfigInfo], - executable = True, -) diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/clang/bin/crosstool_wrapper_driver_is_not_gcc b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/clang/bin/crosstool_wrapper_driver_is_not_gcc deleted file mode 100755 index c49b20f2eb9..00000000000 --- a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/clang/bin/crosstool_wrapper_driver_is_not_gcc +++ /dev/null @@ -1,264 +0,0 @@ -#!/usr/bin/env python -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Crosstool wrapper for compiling CUDA programs. 
-
-SYNOPSIS:
-  crosstool_wrapper_is_not_gcc [options passed in by cc_library()
-                                or cc_binary() rule]
-
-DESCRIPTION:
-  This script is expected to be called by the cc_library() or cc_binary() bazel
-  rules. When the option "-x cuda" is present in the list of arguments passed
-  to this script, it invokes the nvcc CUDA compiler. Most arguments are passed
-  as-is, in a single string, to nvcc's --compiler-options. When "-x cuda" is
-  not present, this wrapper invokes hybrid_driver_is_not_gcc with the input
-  arguments as-is.
-
-NOTES:
-  Changes to the contents of this file must be propagated from
-  //third_party/gpus/crosstool/crosstool_wrapper_is_not_gcc to
-  //third_party/gpus/crosstool/v*/*/clang/bin/crosstool_wrapper_is_not_gcc
-"""
-
-from __future__ import print_function
-
-__author__ = 'keveman@google.com (Manjunath Kudlur)'
-
-from argparse import ArgumentParser
-import os
-import subprocess
-import re
-import sys
-import pipes
-
-# Template values set by cuda_autoconf.
-CPU_COMPILER = ('/usr/bin/gcc')
-GCC_HOST_COMPILER_PATH = ('/usr/bin/gcc')
-
-NVCC_PATH = '/usr/local/cuda-10.0/bin/nvcc'
-PREFIX_DIR = os.path.dirname(GCC_HOST_COMPILER_PATH)
-NVCC_VERSION = '10.0'
-
-def Log(s):
-  print('gpus/crosstool: {0}'.format(s))
-
-
-def GetOptionValue(argv, option):
-  """Extract the list of values for option from the argv list.
-
-  Args:
-    argv: A list of strings, possibly the argv passed to main().
-    option: The option whose value to extract, without the leading '-'.
-
-  Returns:
-    A list of values, either directly following the option
-    (e.g., -opt val1 val2) or collected from multiple occurrences of
-    the option (e.g., -opt val1 -opt val2).
-  """
-
-  parser = ArgumentParser()
-  parser.add_argument('-' + option, nargs='*', action='append')
-  args, _ = parser.parse_known_args(argv)
-  if not args or not vars(args)[option]:
-    return []
-  else:
-    return sum(vars(args)[option], [])
-
-
-def GetHostCompilerOptions(argv):
-  """Collect the -isystem, -iquote, and --sysroot option values from argv.
-
-  Args:
-    argv: A list of strings, possibly the argv passed to main().
-
-  Returns:
-    The string that can be used as the --compiler-options argument to nvcc.
-  """
-
-  parser = ArgumentParser()
-  parser.add_argument('-isystem', nargs='*', action='append')
-  parser.add_argument('-iquote', nargs='*', action='append')
-  parser.add_argument('--sysroot', nargs=1)
-  parser.add_argument('-g', nargs='*', action='append')
-  parser.add_argument('-fno-canonical-system-headers', action='store_true')
-
-  args, _ = parser.parse_known_args(argv)
-
-  opts = ''
-
-  if args.isystem:
-    opts += ' -isystem ' + ' -isystem '.join(sum(args.isystem, []))
-  if args.iquote:
-    opts += ' -iquote ' + ' -iquote '.join(sum(args.iquote, []))
-  if args.g:
-    opts += ' -g' + ' -g'.join(sum(args.g, []))
-  if args.fno_canonical_system_headers:
-    opts += ' -fno-canonical-system-headers'
-  if args.sysroot:
-    opts += ' --sysroot ' + args.sysroot[0]
-
-  return opts
-
-def _update_options(nvcc_options):
-  if NVCC_VERSION in ("7.0",):
-    return nvcc_options
-
-  update_options = { "relaxed-constexpr" : "expt-relaxed-constexpr" }
-  return [ update_options[opt] if opt in update_options else opt
-           for opt in nvcc_options ]
-
-def GetNvccOptions(argv):
-  """Collect the -nvcc_options values from argv.
-
-  Args:
-    argv: A list of strings, possibly the argv passed to main().
-
-  Returns:
-    The string that can be passed directly to nvcc.
-  """
-
-  parser = ArgumentParser()
-  parser.add_argument('-nvcc_options', nargs='*', action='append')
-
-  args, _ = parser.parse_known_args(argv)
-
-  if args.nvcc_options:
-    options = _update_options(sum(args.nvcc_options, []))
-    return ' '.join(['--'+a for a in options])
-  return ''
-
-
-def InvokeNvcc(argv, log=False):
-  """Call nvcc with arguments assembled from argv.
-
-  Args:
-    argv: A list of strings, possibly the argv passed to main().
-    log: True if logging is requested.
-
-  Returns:
-    The return value of calling os.system('nvcc ' + args)
-  """
-
-  host_compiler_options = GetHostCompilerOptions(argv)
-  nvcc_compiler_options = GetNvccOptions(argv)
-  opt_option = GetOptionValue(argv, 'O')
-  m_options = GetOptionValue(argv, 'm')
-  m_options = ''.join([' -m' + m for m in m_options if m in ['32', '64']])
-  include_options = GetOptionValue(argv, 'I')
-  out_file = GetOptionValue(argv, 'o')
-  depfiles = GetOptionValue(argv, 'MF')
-  defines = GetOptionValue(argv, 'D')
-  defines = ''.join([' -D' + define for define in defines])
-  undefines = GetOptionValue(argv, 'U')
-  undefines = ''.join([' -U' + define for define in undefines])
-  std_options = GetOptionValue(argv, 'std')
-  # Currently only c++11 is supported by the CUDA 7.0 -std argument.
-  nvcc_allowed_std_options = ["c++11"]
-  std_options = ''.join([' -std=' + define
-      for define in std_options if define in nvcc_allowed_std_options])
-
-  # The list of source files gets passed after the -c option. I don't know of
-  # any other reliable way to just get the list of source files to be compiled.
-  src_files = GetOptionValue(argv, 'c')
-
-  # Pass -w through from host to nvcc, but don't do anything fancier with
-  # warnings-related flags, since they're not necessarily the same across
-  # compilers.
-  warning_options = ' -w' if '-w' in argv else ''
-
-  if len(src_files) == 0:
-    return 1
-  if len(out_file) != 1:
-    return 1
-
-  opt = (' -O2' if (len(opt_option) > 0 and int(opt_option[0]) > 0)
-         else ' -g -G')
-
-  includes = (' -I ' + ' -I '.join(include_options)
-              if len(include_options) > 0
-              else '')
-
-  # Unfortunately, there are other options that have a -c prefix too.
-  # So allow only those that look like C/C++ files.
-  src_files = [f for f in src_files if
-               re.search(r'\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)]
-  srcs = ' '.join(src_files)
-  out = ' -o ' + out_file[0]
-
-  supported_cuda_compute_capabilities = [ "3.0", "6.0" ]
-  nvccopts = '-D_FORCE_INLINES '
-  for capability in supported_cuda_compute_capabilities:
-    capability = capability.replace('.', '')
-    nvccopts += r'-gencode=arch=compute_%s,\"code=sm_%s,compute_%s\" ' % (
-        capability, capability, capability)
-  nvccopts += ' ' + nvcc_compiler_options
-  nvccopts += undefines
-  nvccopts += defines
-  nvccopts += std_options
-  nvccopts += m_options
-  nvccopts += warning_options
-
-  if depfiles:
-    # Generate the dependency file.
-    depfile = depfiles[0]
-    cmd = (NVCC_PATH + ' ' + nvccopts +
-           ' --compiler-options "' + host_compiler_options + '"' +
-           ' --compiler-bindir=' + GCC_HOST_COMPILER_PATH +
-           ' -I .' +
-           ' -x cu ' + opt + includes + ' ' + srcs + ' -M -o ' + depfile)
-    if log: Log(cmd)
-    exit_status = os.system(cmd)
-    if exit_status != 0:
-      return exit_status
-
-  cmd = (NVCC_PATH + ' ' + nvccopts +
-         ' --compiler-options "' + host_compiler_options + ' -fPIC"' +
-         ' --compiler-bindir=' + GCC_HOST_COMPILER_PATH +
-         ' -I .' +
-         ' -x cu ' + opt + includes + ' -c ' + srcs + out)
-
-  # TODO(zhengxq): for some reason, 'gcc' needs this help to find 'as'.
-  # Need to investigate and fix.
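The PATH prefix applied on the next line works around gcc resolving 'as' by bare name; because this wrapper drives nvcc through os.system, it has to splice the prefix into the shell string. A minimal sketch of the usual alternative, handing subprocess an explicit environment, is below; run_with_prefixed_path is an illustrative name, not part of the wrapper, and it assumes the command is already available as an argument list rather than a quoted shell string.

```python
import os
import subprocess

def run_with_prefixed_path(cmd_args, prefix_dir):
  """Run cmd_args (an argv-style list) with prefix_dir prepended to PATH."""
  env = os.environ.copy()
  env['PATH'] = prefix_dir + os.pathsep + env.get('PATH', '')
  # subprocess resolves the executable against the modified PATH, and any
  # tools the child spawns by bare name (such as 'as') see it too.
  return subprocess.call(cmd_args, env=env)
```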
-  cmd = 'PATH=' + PREFIX_DIR + ':$PATH ' + cmd
-  if log: Log(cmd)
-  return os.system(cmd)
-
-
-def main():
-  parser = ArgumentParser()
-  parser.add_argument('-x', nargs=1)
-  parser.add_argument('--cuda_log', action='store_true')
-  args, leftover = parser.parse_known_args(sys.argv[1:])
-
-  if args.x and args.x[0] == 'cuda':
-    if args.cuda_log: Log('-x cuda')
-    leftover = [pipes.quote(s) for s in leftover]
-    if args.cuda_log: Log('using nvcc')
-    return InvokeNvcc(leftover, log=args.cuda_log)
-
-  # Strip our flags before passing through to the CPU compiler for files which
-  # are not -x cuda. We can't just pass 'leftover' because it also strips -x.
-  # We not only want to pass -x to the CPU compiler, but also keep it in its
-  # relative location in the argv list (the compiler is actually sensitive to
-  # this).
-  cpu_compiler_flags = [flag for flag in sys.argv[1:]
-                        if not flag.startswith(('--cuda_log'))]
-
-  return subprocess.call([CPU_COMPILER] + cpu_compiler_flags)
-
-if __name__ == '__main__':
-  sys.exit(main())
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py
deleted file mode 100755
index 72354b133a9..00000000000
--- a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py
+++ /dev/null
@@ -1,192 +0,0 @@
-#!/usr/bin/env python
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Crosstool wrapper for compiling CUDA programs with nvcc on Windows.
-
-DESCRIPTION:
-  This script is the Windows version of //third_party/gpus/crosstool/crosstool_wrapper_is_not_gcc
-"""
-
-from __future__ import print_function
-
-from argparse import ArgumentParser
-import os
-import subprocess
-import re
-import sys
-import pipes
-
-# Template values set by cuda_autoconf.
-CPU_COMPILER = ('/usr/bin/gcc')
-GCC_HOST_COMPILER_PATH = ('/usr/bin/gcc')
-
-NVCC_PATH = '/usr/local/cuda-10.0/bin/nvcc'
-NVCC_VERSION = '10.0'
-NVCC_TEMP_DIR = "C:\\Windows\\Temp\\nvcc_inter_files_tmp_dir"
-supported_cuda_compute_capabilities = [ "3.0", "6.0" ]
-
-def Log(s):
-  print('gpus/crosstool: {0}'.format(s))
-
-
-def GetOptionValue(argv, option):
-  """Extract the list of values for option from the argv list.
-
-  Args:
-    argv: A list of strings, possibly the argv passed to main().
-    option: The option whose value to extract, without the leading '/'.
-
-  Returns:
-    1. A list of values, either directly following the option
-       (e.g., /opt val1 val2) or collected from multiple occurrences of
-       the option (e.g., /opt val1 /opt val2).
-    2. The leftover options.
-  """
-
-  parser = ArgumentParser(prefix_chars='/')
-  parser.add_argument('/' + option, nargs='*', action='append')
-  args, leftover = parser.parse_known_args(argv)
-  if args and vars(args)[option]:
-    return (sum(vars(args)[option], []), leftover)
-  return ([], leftover)
-
-def _update_options(nvcc_options):
-  if NVCC_VERSION in ("7.0",):
-    return nvcc_options
-
-  update_options = { "relaxed-constexpr" : "expt-relaxed-constexpr" }
-  return [ update_options[opt] if opt in update_options else opt
-           for opt in nvcc_options ]
-
-def GetNvccOptions(argv):
-  """Collect the -nvcc_options values from argv.
-
-  Args:
-    argv: A list of strings, possibly the argv passed to main().
-
-  Returns:
-    1. The string that can be passed directly to nvcc.
-    2. The leftover options.
-  """
-
-  parser = ArgumentParser()
-  parser.add_argument('-nvcc_options', nargs='*', action='append')
-
-  args, leftover = parser.parse_known_args(argv)
-
-  if args.nvcc_options:
-    options = _update_options(sum(args.nvcc_options, []))
-    return (['--' + a for a in options], leftover)
-  return ([], leftover)
-
-
-def InvokeNvcc(argv, log=False):
-  """Call nvcc with arguments assembled from argv.
-
-  Args:
-    argv: A list of strings, possibly the argv passed to main().
-    log: True if logging is requested.
-
-  Returns:
-    The return value of calling os.system('nvcc ' + args)
-  """
-
-  src_files = [f for f in argv if
-               re.search(r'\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)]
-  if len(src_files) == 0:
-    # 'Error' was undefined in the original; raise a real exception type.
-    raise RuntimeError('No source files found for cuda compilation.')
-
-  out_file = [ f for f in argv if f.startswith('/Fo') ]
-  if len(out_file) != 1:
-    raise RuntimeError('Please specify exactly one output file for cuda compilation.')
-  out = ['-o', out_file[0][len('/Fo'):]]
-
-  nvcc_compiler_options, argv = GetNvccOptions(argv)
-
-  opt_option, argv = GetOptionValue(argv, 'O')
-  opt = ['-g', '-G']
-  if (len(opt_option) > 0 and opt_option[0] != 'd'):
-    opt = ['-O2']
-
-  include_options, argv = GetOptionValue(argv, 'I')
-  includes = ["-I " + include for include in include_options]
-
-  defines, argv = GetOptionValue(argv, 'D')
-  defines = ['-D' + define for define in defines]
-
-  undefines, argv = GetOptionValue(argv, 'U')
-  undefines = ['-U' + define for define in undefines]
-
-  # The rest of the unrecognized options are passed through to the host compiler.
-  host_compiler_options = [option for option in argv if option not in (src_files + out_file)]
-
-  m_options = ["-m64"]
-
-  nvccopts = ['-D_FORCE_INLINES']
-  for capability in supported_cuda_compute_capabilities:
-    capability = capability.replace('.', '')
-    nvccopts += [r'-gencode=arch=compute_%s,"code=sm_%s,compute_%s"' % (
-        capability, capability, capability)]
-  nvccopts += nvcc_compiler_options
-  nvccopts += undefines
-  nvccopts += defines
-  nvccopts += m_options
-  nvccopts += ['--compiler-options="' + " ".join(host_compiler_options) + '"']
-  nvccopts += ['-x', 'cu'] + opt + includes + out + ['-c'] + src_files
-  # If we don't specify --keep-dir, nvcc generates intermediate files under TEMP.
-  # Put them under NVCC_TEMP_DIR instead, so Bazel can ignore files under
-  # NVCC_TEMP_DIR during its dependency check:
-  # http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#options-for-guiding-compiler-driver
-  # Different actions share NVCC_TEMP_DIR, so we cannot remove it if the
-  # directory already exists.
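The check-then-create sequence that follows is written to run on Python 2 as well as 3, hence the explicit isfile/exists tests. On Python 3 alone, the directory creation can be collapsed into one call that also tolerates concurrent actions racing to create the shared directory; a sketch under that assumption (the constant's value is copied from the wrapper above, and the guard against a stray regular file at the path is still needed):

```python
import os

# Value copied from the wrapper above.
NVCC_TEMP_DIR = "C:\\Windows\\Temp\\nvcc_inter_files_tmp_dir"

# Clear a stray regular file at the path, if any, then create the directory.
# exist_ok=True (Python 3) makes the create step safe when several build
# actions try to create the same directory at once.
if os.path.isfile(NVCC_TEMP_DIR):
  os.remove(NVCC_TEMP_DIR)
os.makedirs(NVCC_TEMP_DIR, exist_ok=True)
```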
- if os.path.isfile(NVCC_TEMP_DIR): - os.remove(NVCC_TEMP_DIR) - if not os.path.exists(NVCC_TEMP_DIR): - os.makedirs(NVCC_TEMP_DIR) - nvccopts += ['--keep', '--keep-dir', NVCC_TEMP_DIR] - cmd = [NVCC_PATH] + nvccopts - if log: - Log(cmd) - proc = subprocess.Popen(cmd, - stdout=sys.stdout, - stderr=sys.stderr, - env=os.environ.copy(), - shell=True) - proc.wait() - return proc.returncode - -def main(): - parser = ArgumentParser() - parser.add_argument('-x', nargs=1) - parser.add_argument('--cuda_log', action='store_true') - args, leftover = parser.parse_known_args(sys.argv[1:]) - - if args.x and args.x[0] == 'cuda': - if args.cuda_log: Log('-x cuda') - leftover = [pipes.quote(s) for s in leftover] - if args.cuda_log: Log('using nvcc') - return InvokeNvcc(leftover, log=args.cuda_log) - - # Strip our flags before passing through to the CPU compiler for files which - # are not -x cuda. We can't just pass 'leftover' because it also strips -x. - # We not only want to pass -x to the CPU compiler, but also keep it in its - # relative location in the argv list (the compiler is actually sensitive to - # this). - cpu_compiler_flags = [flag for flag in sys.argv[1:] - if not flag.startswith(('--cuda_log')) - and not flag.startswith(('-nvcc_options'))] - - return subprocess.call([CPU_COMPILER] + cpu_compiler_flags) - -if __name__ == '__main__': - sys.exit(main()) diff --git a/third_party/toolchains/preconfig/ubuntu14.04/py3/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/py3/BUILD deleted file mode 100755 index 460c879d32f..00000000000 --- a/third_party/toolchains/preconfig/ubuntu14.04/py3/BUILD +++ /dev/null @@ -1,176 +0,0 @@ -licenses(["restricted"]) - -package(default_visibility = ["//visibility:public"]) - -# To build Python C/C++ extension on Windows, we need to link to python import library pythonXY.lib -# See https://docs.python.org/3/extending/windows.html -cc_import( - name = "python_lib", - interface_library = select({ - ":windows": ":python_import_lib", - # A placeholder for Unix platforms which makes --no_build happy. 
- "//conditions:default": "not-existing.lib", - }), - system_provided = 1, -) - -cc_library( - name = "python_headers", - hdrs = [":python_include"], - includes = ["python_include"], - deps = select({ - ":windows": [":python_lib"], - "//conditions:default": [], - }), -) - -cc_library( - name = "numpy_headers", - hdrs = [":numpy_include"], - includes = ["numpy_include"], -) - -config_setting( - name = "windows", - values = {"cpu": "x64_windows"}, - visibility = ["//visibility:public"], -) - -genrule( - name = "python_include", - outs = [ - "python_include/Python-ast.h", - "python_include/Python.h", - "python_include/abstract.h", - "python_include/accu.h", - "python_include/asdl.h", - "python_include/ast.h", - "python_include/bitset.h", - "python_include/bltinmodule.h", - "python_include/boolobject.h", - "python_include/bytearrayobject.h", - "python_include/bytes_methods.h", - "python_include/bytesobject.h", - "python_include/cellobject.h", - "python_include/ceval.h", - "python_include/classobject.h", - "python_include/code.h", - "python_include/codecs.h", - "python_include/compile.h", - "python_include/complexobject.h", - "python_include/datetime.h", - "python_include/descrobject.h", - "python_include/dictobject.h", - "python_include/dtoa.h", - "python_include/dynamic_annotations.h", - "python_include/enumobject.h", - "python_include/errcode.h", - "python_include/eval.h", - "python_include/fileobject.h", - "python_include/fileutils.h", - "python_include/floatobject.h", - "python_include/frameobject.h", - "python_include/funcobject.h", - "python_include/genobject.h", - "python_include/graminit.h", - "python_include/grammar.h", - "python_include/import.h", - "python_include/intrcheck.h", - "python_include/iterobject.h", - "python_include/listobject.h", - "python_include/longintrepr.h", - "python_include/longobject.h", - "python_include/marshal.h", - "python_include/memoryobject.h", - "python_include/metagrammar.h", - "python_include/methodobject.h", - "python_include/modsupport.h", - "python_include/moduleobject.h", - "python_include/namespaceobject.h", - "python_include/node.h", - "python_include/object.h", - "python_include/objimpl.h", - "python_include/opcode.h", - "python_include/osdefs.h", - "python_include/parsetok.h", - "python_include/patchlevel.h", - "python_include/pgen.h", - "python_include/pgenheaders.h", - "python_include/py_curses.h", - "python_include/pyarena.h", - "python_include/pyatomic.h", - "python_include/pycapsule.h", - "python_include/pyconfig.h", - "python_include/pyctype.h", - "python_include/pydebug.h", - "python_include/pyerrors.h", - "python_include/pyexpat.h", - "python_include/pyfpe.h", - "python_include/pygetopt.h", - "python_include/pyhash.h", - "python_include/pymacconfig.h", - "python_include/pymacro.h", - "python_include/pymath.h", - "python_include/pymem.h", - "python_include/pyport.h", - "python_include/pystate.h", - "python_include/pystrcmp.h", - "python_include/pystrtod.h", - "python_include/pythonrun.h", - "python_include/pythread.h", - "python_include/pytime.h", - "python_include/rangeobject.h", - "python_include/setobject.h", - "python_include/sliceobject.h", - "python_include/structmember.h", - "python_include/structseq.h", - "python_include/symtable.h", - "python_include/sysmodule.h", - "python_include/token.h", - "python_include/traceback.h", - "python_include/tupleobject.h", - "python_include/typeslots.h", - "python_include/ucnhash.h", - "python_include/unicodeobject.h", - "python_include/warnings.h", - "python_include/weakrefobject.h", - ], - cmd 
= """ -cp -f "/usr/include/python3.4m/Python-ast.h" "$(@D)/python_include/Python-ast.h" && cp -f "/usr/include/python3.4m/Python.h" "$(@D)/python_include/Python.h" && cp -f "/usr/include/python3.4m/abstract.h" "$(@D)/python_include/abstract.h" && cp -f "/usr/include/python3.4m/accu.h" "$(@D)/python_include/accu.h" && cp -f "/usr/include/python3.4m/asdl.h" "$(@D)/python_include/asdl.h" && cp -f "/usr/include/python3.4m/ast.h" "$(@D)/python_include/ast.h" && cp -f "/usr/include/python3.4m/bitset.h" "$(@D)/python_include/bitset.h" && cp -f "/usr/include/python3.4m/bltinmodule.h" "$(@D)/python_include/bltinmodule.h" && cp -f "/usr/include/python3.4m/boolobject.h" "$(@D)/python_include/boolobject.h" && cp -f "/usr/include/python3.4m/bytearrayobject.h" "$(@D)/python_include/bytearrayobject.h" && cp -f "/usr/include/python3.4m/bytes_methods.h" "$(@D)/python_include/bytes_methods.h" && cp -f "/usr/include/python3.4m/bytesobject.h" "$(@D)/python_include/bytesobject.h" && cp -f "/usr/include/python3.4m/cellobject.h" "$(@D)/python_include/cellobject.h" && cp -f "/usr/include/python3.4m/ceval.h" "$(@D)/python_include/ceval.h" && cp -f "/usr/include/python3.4m/classobject.h" "$(@D)/python_include/classobject.h" && cp -f "/usr/include/python3.4m/code.h" "$(@D)/python_include/code.h" && cp -f "/usr/include/python3.4m/codecs.h" "$(@D)/python_include/codecs.h" && cp -f "/usr/include/python3.4m/compile.h" "$(@D)/python_include/compile.h" && cp -f "/usr/include/python3.4m/complexobject.h" "$(@D)/python_include/complexobject.h" && cp -f "/usr/include/python3.4m/datetime.h" "$(@D)/python_include/datetime.h" && cp -f "/usr/include/python3.4m/descrobject.h" "$(@D)/python_include/descrobject.h" && cp -f "/usr/include/python3.4m/dictobject.h" "$(@D)/python_include/dictobject.h" && cp -f "/usr/include/python3.4m/dtoa.h" "$(@D)/python_include/dtoa.h" && cp -f "/usr/include/python3.4m/dynamic_annotations.h" "$(@D)/python_include/dynamic_annotations.h" && cp -f "/usr/include/python3.4m/enumobject.h" "$(@D)/python_include/enumobject.h" && cp -f "/usr/include/python3.4m/errcode.h" "$(@D)/python_include/errcode.h" && cp -f "/usr/include/python3.4m/eval.h" "$(@D)/python_include/eval.h" && cp -f "/usr/include/python3.4m/fileobject.h" "$(@D)/python_include/fileobject.h" && cp -f "/usr/include/python3.4m/fileutils.h" "$(@D)/python_include/fileutils.h" && cp -f "/usr/include/python3.4m/floatobject.h" "$(@D)/python_include/floatobject.h" && cp -f "/usr/include/python3.4m/frameobject.h" "$(@D)/python_include/frameobject.h" && cp -f "/usr/include/python3.4m/funcobject.h" "$(@D)/python_include/funcobject.h" && cp -f "/usr/include/python3.4m/genobject.h" "$(@D)/python_include/genobject.h" && cp -f "/usr/include/python3.4m/graminit.h" "$(@D)/python_include/graminit.h" && cp -f "/usr/include/python3.4m/grammar.h" "$(@D)/python_include/grammar.h" && cp -f "/usr/include/python3.4m/import.h" "$(@D)/python_include/import.h" && cp -f "/usr/include/python3.4m/intrcheck.h" "$(@D)/python_include/intrcheck.h" && cp -f "/usr/include/python3.4m/iterobject.h" "$(@D)/python_include/iterobject.h" && cp -f "/usr/include/python3.4m/listobject.h" "$(@D)/python_include/listobject.h" && cp -f "/usr/include/python3.4m/longintrepr.h" "$(@D)/python_include/longintrepr.h" && cp -f "/usr/include/python3.4m/longobject.h" "$(@D)/python_include/longobject.h" && cp -f "/usr/include/python3.4m/marshal.h" "$(@D)/python_include/marshal.h" && cp -f "/usr/include/python3.4m/memoryobject.h" "$(@D)/python_include/memoryobject.h" && cp -f 
"/usr/include/python3.4m/metagrammar.h" "$(@D)/python_include/metagrammar.h" && cp -f "/usr/include/python3.4m/methodobject.h" "$(@D)/python_include/methodobject.h" && cp -f "/usr/include/python3.4m/modsupport.h" "$(@D)/python_include/modsupport.h" && cp -f "/usr/include/python3.4m/moduleobject.h" "$(@D)/python_include/moduleobject.h" && cp -f "/usr/include/python3.4m/namespaceobject.h" "$(@D)/python_include/namespaceobject.h" && cp -f "/usr/include/python3.4m/node.h" "$(@D)/python_include/node.h" && cp -f "/usr/include/python3.4m/object.h" "$(@D)/python_include/object.h" && cp -f "/usr/include/python3.4m/objimpl.h" "$(@D)/python_include/objimpl.h" && cp -f "/usr/include/python3.4m/opcode.h" "$(@D)/python_include/opcode.h" && cp -f "/usr/include/python3.4m/osdefs.h" "$(@D)/python_include/osdefs.h" && cp -f "/usr/include/python3.4m/parsetok.h" "$(@D)/python_include/parsetok.h" && cp -f "/usr/include/python3.4m/patchlevel.h" "$(@D)/python_include/patchlevel.h" && cp -f "/usr/include/python3.4m/pgen.h" "$(@D)/python_include/pgen.h" && cp -f "/usr/include/python3.4m/pgenheaders.h" "$(@D)/python_include/pgenheaders.h" && cp -f "/usr/include/python3.4m/py_curses.h" "$(@D)/python_include/py_curses.h" && cp -f "/usr/include/python3.4m/pyarena.h" "$(@D)/python_include/pyarena.h" && cp -f "/usr/include/python3.4m/pyatomic.h" "$(@D)/python_include/pyatomic.h" && cp -f "/usr/include/python3.4m/pycapsule.h" "$(@D)/python_include/pycapsule.h" && cp -f "/usr/include/python3.4m/pyconfig.h" "$(@D)/python_include/pyconfig.h" && cp -f "/usr/include/python3.4m/pyctype.h" "$(@D)/python_include/pyctype.h" && cp -f "/usr/include/python3.4m/pydebug.h" "$(@D)/python_include/pydebug.h" && cp -f "/usr/include/python3.4m/pyerrors.h" "$(@D)/python_include/pyerrors.h" && cp -f "/usr/include/python3.4m/pyexpat.h" "$(@D)/python_include/pyexpat.h" && cp -f "/usr/include/python3.4m/pyfpe.h" "$(@D)/python_include/pyfpe.h" && cp -f "/usr/include/python3.4m/pygetopt.h" "$(@D)/python_include/pygetopt.h" && cp -f "/usr/include/python3.4m/pyhash.h" "$(@D)/python_include/pyhash.h" && cp -f "/usr/include/python3.4m/pymacconfig.h" "$(@D)/python_include/pymacconfig.h" && cp -f "/usr/include/python3.4m/pymacro.h" "$(@D)/python_include/pymacro.h" && cp -f "/usr/include/python3.4m/pymath.h" "$(@D)/python_include/pymath.h" && cp -f "/usr/include/python3.4m/pymem.h" "$(@D)/python_include/pymem.h" && cp -f "/usr/include/python3.4m/pyport.h" "$(@D)/python_include/pyport.h" && cp -f "/usr/include/python3.4m/pystate.h" "$(@D)/python_include/pystate.h" && cp -f "/usr/include/python3.4m/pystrcmp.h" "$(@D)/python_include/pystrcmp.h" && cp -f "/usr/include/python3.4m/pystrtod.h" "$(@D)/python_include/pystrtod.h" && cp -f "/usr/include/python3.4m/pythonrun.h" "$(@D)/python_include/pythonrun.h" && cp -f "/usr/include/python3.4m/pythread.h" "$(@D)/python_include/pythread.h" && cp -f "/usr/include/python3.4m/pytime.h" "$(@D)/python_include/pytime.h" && cp -f "/usr/include/python3.4m/rangeobject.h" "$(@D)/python_include/rangeobject.h" && cp -f "/usr/include/python3.4m/setobject.h" "$(@D)/python_include/setobject.h" && cp -f "/usr/include/python3.4m/sliceobject.h" "$(@D)/python_include/sliceobject.h" && cp -f "/usr/include/python3.4m/structmember.h" "$(@D)/python_include/structmember.h" && cp -f "/usr/include/python3.4m/structseq.h" "$(@D)/python_include/structseq.h" && cp -f "/usr/include/python3.4m/symtable.h" "$(@D)/python_include/symtable.h" && cp -f "/usr/include/python3.4m/sysmodule.h" "$(@D)/python_include/sysmodule.h" && cp -f 
"/usr/include/python3.4m/token.h" "$(@D)/python_include/token.h" && cp -f "/usr/include/python3.4m/traceback.h" "$(@D)/python_include/traceback.h" && cp -f "/usr/include/python3.4m/tupleobject.h" "$(@D)/python_include/tupleobject.h" && cp -f "/usr/include/python3.4m/typeslots.h" "$(@D)/python_include/typeslots.h" && cp -f "/usr/include/python3.4m/ucnhash.h" "$(@D)/python_include/ucnhash.h" && cp -f "/usr/include/python3.4m/unicodeobject.h" "$(@D)/python_include/unicodeobject.h" && cp -f "/usr/include/python3.4m/warnings.h" "$(@D)/python_include/warnings.h" && cp -f "/usr/include/python3.4m/weakrefobject.h" "$(@D)/python_include/weakrefobject.h" - """, -) - -genrule( - name = "numpy_include", - outs = [ - "numpy_include/numpy/__multiarray_api.h", - "numpy_include/numpy/__ufunc_api.h", - "numpy_include/numpy/_neighborhood_iterator_imp.h", - "numpy_include/numpy/_numpyconfig.h", - "numpy_include/numpy/arrayobject.h", - "numpy_include/numpy/arrayscalars.h", - "numpy_include/numpy/halffloat.h", - "numpy_include/numpy/multiarray_api.txt", - "numpy_include/numpy/ndarrayobject.h", - "numpy_include/numpy/ndarraytypes.h", - "numpy_include/numpy/noprefix.h", - "numpy_include/numpy/npy_1_7_deprecated_api.h", - "numpy_include/numpy/npy_3kcompat.h", - "numpy_include/numpy/npy_common.h", - "numpy_include/numpy/npy_cpu.h", - "numpy_include/numpy/npy_endian.h", - "numpy_include/numpy/npy_interrupt.h", - "numpy_include/numpy/npy_math.h", - "numpy_include/numpy/npy_no_deprecated_api.h", - "numpy_include/numpy/npy_os.h", - "numpy_include/numpy/numpyconfig.h", - "numpy_include/numpy/old_defines.h", - "numpy_include/numpy/oldnumeric.h", - "numpy_include/numpy/ufunc_api.txt", - "numpy_include/numpy/ufuncobject.h", - "numpy_include/numpy/utils.h", - ], - cmd = """ -cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/__multiarray_api.h" "$(@D)/numpy_include/numpy/__multiarray_api.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/__ufunc_api.h" "$(@D)/numpy_include/numpy/__ufunc_api.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/_neighborhood_iterator_imp.h" "$(@D)/numpy_include/numpy/_neighborhood_iterator_imp.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/_numpyconfig.h" "$(@D)/numpy_include/numpy/_numpyconfig.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/arrayobject.h" "$(@D)/numpy_include/numpy/arrayobject.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/arrayscalars.h" "$(@D)/numpy_include/numpy/arrayscalars.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/halffloat.h" "$(@D)/numpy_include/numpy/halffloat.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/multiarray_api.txt" "$(@D)/numpy_include/numpy/multiarray_api.txt" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/ndarrayobject.h" "$(@D)/numpy_include/numpy/ndarrayobject.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/ndarraytypes.h" "$(@D)/numpy_include/numpy/ndarraytypes.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/noprefix.h" "$(@D)/numpy_include/numpy/noprefix.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_1_7_deprecated_api.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_3kcompat.h" "$(@D)/numpy_include/numpy/npy_3kcompat.h" && cp -f 
"/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_common.h" "$(@D)/numpy_include/numpy/npy_common.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_cpu.h" "$(@D)/numpy_include/numpy/npy_cpu.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_endian.h" "$(@D)/numpy_include/numpy/npy_endian.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_interrupt.h" "$(@D)/numpy_include/numpy/npy_interrupt.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_math.h" "$(@D)/numpy_include/numpy/npy_math.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_no_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_no_deprecated_api.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_os.h" "$(@D)/numpy_include/numpy/npy_os.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/numpyconfig.h" "$(@D)/numpy_include/numpy/numpyconfig.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/old_defines.h" "$(@D)/numpy_include/numpy/old_defines.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/oldnumeric.h" "$(@D)/numpy_include/numpy/oldnumeric.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/ufunc_api.txt" "$(@D)/numpy_include/numpy/ufunc_api.txt" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/ufuncobject.h" "$(@D)/numpy_include/numpy/ufuncobject.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/utils.h" "$(@D)/numpy_include/numpy/utils.h" - """, -) diff --git a/third_party/toolchains/preconfig/ubuntu14.04/py3/WORKSPACE b/third_party/toolchains/preconfig/ubuntu14.04/py3/WORKSPACE deleted file mode 100644 index 1d298fefa3b..00000000000 --- a/third_party/toolchains/preconfig/ubuntu14.04/py3/WORKSPACE +++ /dev/null @@ -1,2 +0,0 @@ -# DO NOT EDIT: automatically generated WORKSPACE file for python_configure rule -workspace(name = "local_config_python") diff --git a/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/BUILD deleted file mode 100755 index 88980d1014a..00000000000 --- a/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/BUILD +++ /dev/null @@ -1,63 +0,0 @@ -# NVIDIA TensorRT -# A high-performance deep learning inference optimizer and runtime. 
- -licenses(["notice"]) - -load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts") -load("@bazel_skylib//:bzl_library.bzl", "bzl_library") - -package(default_visibility = ["//visibility:public"]) - -exports_files(["LICENSE"]) - -cc_library( - name = "tensorrt_headers", - hdrs = [ - "tensorrt/include/tensorrt_config.h", - ":tensorrt_include", - ], - include_prefix = "third_party/tensorrt", - strip_include_prefix = "tensorrt/include", -) - -cc_library( - name = "tensorrt", - srcs = [":tensorrt_lib"], - copts = cuda_default_copts(), - data = [":tensorrt_lib"], - linkstatic = 1, - deps = [ - ":tensorrt_headers", - "@local_config_cuda//cuda", - ], -) - -bzl_library( - name = "build_defs_bzl", - srcs = ["build_defs.bzl"], - deps = [ - "@bazel_skylib//lib:selects", - ], -) - -genrule( - name = "tensorrt_lib", - outs = [ - "tensorrt/lib/libnvinfer.so.5", - "tensorrt/lib/libnvinfer_plugin.so.5", - ], - cmd = """cp -f "/usr/lib/x86_64-linux-gnu/libnvinfer.so.5" "$(location tensorrt/lib/libnvinfer.so.5)" && \ -cp -f "/usr/lib/x86_64-linux-gnu/libnvinfer_plugin.so.5" "$(location tensorrt/lib/libnvinfer_plugin.so.5)" """, -) - -genrule( - name = "tensorrt_include", - outs = [ - "tensorrt/include/NvInfer.h", - "tensorrt/include/NvUtils.h", - "tensorrt/include/NvInferPlugin.h", - ], - cmd = """cp -f "/usr/include/x86_64-linux-gnu/NvInfer.h" "$(location tensorrt/include/NvInfer.h)" && \ -cp -f "/usr/include/x86_64-linux-gnu/NvUtils.h" "$(location tensorrt/include/NvUtils.h)" && \ -cp -f "/usr/include/x86_64-linux-gnu/NvInferPlugin.h" "$(location tensorrt/include/NvInferPlugin.h)" """, -) diff --git a/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/LICENSE b/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/LICENSE deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/WORKSPACE b/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/WORKSPACE deleted file mode 100644 index ce47f14b91b..00000000000 --- a/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/WORKSPACE +++ /dev/null @@ -1,2 +0,0 @@ -# DO NOT EDIT: automatically generated WORKSPACE file for tensorrt_configure rule -workspace(name = "local_config_tensorrt") diff --git a/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/build_defs.bzl b/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/build_defs.bzl deleted file mode 100755 index 527be938341..00000000000 --- a/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/build_defs.bzl +++ /dev/null @@ -1,5 +0,0 @@ -# Build configurations for TensorRT. - -def if_tensorrt(if_true, if_false = []): - """Tests whether TensorRT was enabled during the configure process.""" - return if_true diff --git a/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/tensorrt/include/tensorrt_config.h b/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/tensorrt/include/tensorrt_config.h deleted file mode 100644 index 02a166f4cd1..00000000000 --- a/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/tensorrt/include/tensorrt_config.h +++ /dev/null @@ -1,21 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
diff --git a/third_party/toolchains/preconfig/ubuntu16.04/py3/BUILD b/third_party/toolchains/preconfig/ubuntu16.04/py3/BUILD
deleted file mode 100755
index 81bd7358254..00000000000
--- a/third_party/toolchains/preconfig/ubuntu16.04/py3/BUILD
+++ /dev/null
@@ -1,209 +0,0 @@
-licenses(["restricted"])
-
-package(default_visibility = ["//visibility:public"])
-
-# Point both runtimes to the same python binary to ensure we always
-# use the python binary specified by ./configure.py script.
-load("@bazel_tools//tools/python:toolchain.bzl", "py_runtime_pair")
-
-py_runtime(
-    name = "py2_runtime",
-    interpreter_path = "/usr/bin/python3",
-    python_version = "PY2",
-)
-
-py_runtime(
-    name = "py3_runtime",
-    interpreter_path = "/usr/bin/python3",
-    python_version = "PY3",
-)
-
-py_runtime_pair(
-    name = "py_runtime_pair",
-    py2_runtime = ":py2_runtime",
-    py3_runtime = ":py3_runtime",
-)
-
-toolchain(
-    name = "py_toolchain",
-    toolchain = ":py_runtime_pair",
-    toolchain_type = "@bazel_tools//tools/python:toolchain_type",
-)
-
-# To build Python C/C++ extension on Windows, we need to link to python import library pythonXY.lib
-# See https://docs.python.org/3/extending/windows.html
-cc_import(
-    name = "python_lib",
-    interface_library = select({
-        ":windows": ":python_import_lib",
-        # A placeholder for Unix platforms which makes --no_build happy.
-        "//conditions:default": "not-existing.lib",
-    }),
-    system_provided = 1,
-)
-
-cc_library(
-    name = "python_headers",
-    hdrs = [":python_include"],
-    includes = ["python_include"],
-    deps = select({
-        ":windows": [":python_lib"],
-        "//conditions:default": [],
-    }),
-)
-
-cc_library(
-    name = "numpy_headers",
-    hdrs = [":numpy_include"],
-    includes = ["numpy_include"],
-)
-
-config_setting(
-    name = "windows",
-    values = {"cpu": "x64_windows"},
-    visibility = ["//visibility:public"],
-)
-
-genrule(
-    name = "python_include",
-    outs = [
-        "python_include/Python-ast.h",
-        "python_include/Python.h",
-        "python_include/abstract.h",
-        "python_include/accu.h",
-        "python_include/asdl.h",
-        "python_include/ast.h",
-        "python_include/bitset.h",
-        "python_include/bltinmodule.h",
-        "python_include/boolobject.h",
-        "python_include/bytearrayobject.h",
-        "python_include/bytes_methods.h",
-        "python_include/bytesobject.h",
-        "python_include/cellobject.h",
-        "python_include/ceval.h",
-        "python_include/classobject.h",
-        "python_include/code.h",
-        "python_include/codecs.h",
-        "python_include/compile.h",
-        "python_include/complexobject.h",
-        "python_include/datetime.h",
-        "python_include/descrobject.h",
-        "python_include/dictobject.h",
-        "python_include/dtoa.h",
-        "python_include/dynamic_annotations.h",
-        "python_include/enumobject.h",
-        "python_include/errcode.h",
-        "python_include/eval.h",
-        "python_include/fileobject.h",
-        "python_include/fileutils.h",
-        "python_include/floatobject.h",
-        "python_include/frameobject.h",
-        "python_include/funcobject.h",
-        "python_include/genobject.h",
-        "python_include/graminit.h",
-        "python_include/grammar.h",
-        "python_include/import.h",
-        "python_include/intrcheck.h",
-        "python_include/iterobject.h",
-        "python_include/listobject.h",
-        "python_include/longintrepr.h",
-        "python_include/longobject.h",
-        "python_include/marshal.h",
-        "python_include/memoryobject.h",
-        "python_include/metagrammar.h",
-        "python_include/methodobject.h",
-        "python_include/modsupport.h",
-        "python_include/moduleobject.h",
-        "python_include/namespaceobject.h",
-        "python_include/node.h",
-        "python_include/object.h",
-        "python_include/objimpl.h",
-        "python_include/odictobject.h",
-        "python_include/opcode.h",
-        "python_include/osdefs.h",
-        "python_include/osmodule.h",
-        "python_include/parsetok.h",
-        "python_include/patchlevel.h",
-        "python_include/pgen.h",
-        "python_include/pgenheaders.h",
-        "python_include/py_curses.h",
-        "python_include/pyarena.h",
-        "python_include/pyatomic.h",
-        "python_include/pycapsule.h",
-        "python_include/pyconfig.h",
-        "python_include/pyctype.h",
-        "python_include/pydebug.h",
-        "python_include/pydtrace.h",
-        "python_include/pyerrors.h",
-        "python_include/pyexpat.h",
-        "python_include/pyfpe.h",
-        "python_include/pygetopt.h",
-        "python_include/pyhash.h",
-        "python_include/pylifecycle.h",
-        "python_include/pymacconfig.h",
-        "python_include/pymacro.h",
-        "python_include/pymath.h",
-        "python_include/pymem.h",
-        "python_include/pyport.h",
-        "python_include/pystate.h",
-        "python_include/pystrcmp.h",
-        "python_include/pystrhex.h",
-        "python_include/pystrtod.h",
-        "python_include/pythonrun.h",
-        "python_include/pythread.h",
-        "python_include/pytime.h",
-        "python_include/rangeobject.h",
-        "python_include/setobject.h",
-        "python_include/sliceobject.h",
-        "python_include/structmember.h",
-        "python_include/structseq.h",
-        "python_include/symtable.h",
-        "python_include/sysmodule.h",
-        "python_include/token.h",
-        "python_include/traceback.h",
-        "python_include/tupleobject.h",
-        "python_include/typeslots.h",
-        "python_include/ucnhash.h",
-        "python_include/unicodeobject.h",
-        "python_include/warnings.h",
-        "python_include/weakrefobject.h",
-    ],
-    cmd = """
-cp -f "/usr/include/python3.6m/Python-ast.h" "$(@D)/python_include/Python-ast.h" && cp -f "/usr/include/python3.6m/Python.h" "$(@D)/python_include/Python.h" && cp -f "/usr/include/python3.6m/abstract.h" "$(@D)/python_include/abstract.h" && cp -f "/usr/include/python3.6m/accu.h" "$(@D)/python_include/accu.h" && cp -f "/usr/include/python3.6m/asdl.h" "$(@D)/python_include/asdl.h" && cp -f "/usr/include/python3.6m/ast.h" "$(@D)/python_include/ast.h" && cp -f "/usr/include/python3.6m/bitset.h" "$(@D)/python_include/bitset.h" && cp -f "/usr/include/python3.6m/bltinmodule.h" "$(@D)/python_include/bltinmodule.h" && cp -f "/usr/include/python3.6m/boolobject.h" "$(@D)/python_include/boolobject.h" && cp -f "/usr/include/python3.6m/bytearrayobject.h" "$(@D)/python_include/bytearrayobject.h" && cp -f "/usr/include/python3.6m/bytes_methods.h" "$(@D)/python_include/bytes_methods.h" && cp -f "/usr/include/python3.6m/bytesobject.h" "$(@D)/python_include/bytesobject.h" && cp -f "/usr/include/python3.6m/cellobject.h" "$(@D)/python_include/cellobject.h" && cp -f "/usr/include/python3.6m/ceval.h" "$(@D)/python_include/ceval.h" && cp -f "/usr/include/python3.6m/classobject.h" "$(@D)/python_include/classobject.h" && cp -f "/usr/include/python3.6m/code.h" "$(@D)/python_include/code.h" && cp -f "/usr/include/python3.6m/codecs.h" "$(@D)/python_include/codecs.h" && cp -f "/usr/include/python3.6m/compile.h" "$(@D)/python_include/compile.h" && cp -f "/usr/include/python3.6m/complexobject.h" "$(@D)/python_include/complexobject.h" && cp -f "/usr/include/python3.6m/datetime.h" "$(@D)/python_include/datetime.h" && cp -f "/usr/include/python3.6m/descrobject.h" "$(@D)/python_include/descrobject.h" && cp -f "/usr/include/python3.6m/dictobject.h" "$(@D)/python_include/dictobject.h" && cp -f "/usr/include/python3.6m/dtoa.h" "$(@D)/python_include/dtoa.h" && cp -f "/usr/include/python3.6m/dynamic_annotations.h" "$(@D)/python_include/dynamic_annotations.h" && cp -f "/usr/include/python3.6m/enumobject.h" "$(@D)/python_include/enumobject.h" && cp -f "/usr/include/python3.6m/errcode.h" "$(@D)/python_include/errcode.h" && cp -f "/usr/include/python3.6m/eval.h" "$(@D)/python_include/eval.h" && cp -f "/usr/include/python3.6m/fileobject.h" "$(@D)/python_include/fileobject.h" && cp -f "/usr/include/python3.6m/fileutils.h" "$(@D)/python_include/fileutils.h" && cp -f "/usr/include/python3.6m/floatobject.h" "$(@D)/python_include/floatobject.h" && cp -f "/usr/include/python3.6m/frameobject.h" "$(@D)/python_include/frameobject.h" && cp -f "/usr/include/python3.6m/funcobject.h" "$(@D)/python_include/funcobject.h" && cp -f "/usr/include/python3.6m/genobject.h" "$(@D)/python_include/genobject.h" && cp -f "/usr/include/python3.6m/graminit.h" "$(@D)/python_include/graminit.h" && cp -f "/usr/include/python3.6m/grammar.h" "$(@D)/python_include/grammar.h" && cp -f "/usr/include/python3.6m/import.h" "$(@D)/python_include/import.h" && cp -f "/usr/include/python3.6m/intrcheck.h" "$(@D)/python_include/intrcheck.h" && cp -f "/usr/include/python3.6m/iterobject.h" "$(@D)/python_include/iterobject.h" && cp -f "/usr/include/python3.6m/listobject.h" "$(@D)/python_include/listobject.h" && cp -f "/usr/include/python3.6m/longintrepr.h" "$(@D)/python_include/longintrepr.h" && cp -f "/usr/include/python3.6m/longobject.h" "$(@D)/python_include/longobject.h" && cp -f "/usr/include/python3.6m/marshal.h" "$(@D)/python_include/marshal.h" && cp -f "/usr/include/python3.6m/memoryobject.h" "$(@D)/python_include/memoryobject.h" && cp -f "/usr/include/python3.6m/metagrammar.h" "$(@D)/python_include/metagrammar.h" && cp -f "/usr/include/python3.6m/methodobject.h" "$(@D)/python_include/methodobject.h" && cp -f "/usr/include/python3.6m/modsupport.h" "$(@D)/python_include/modsupport.h" && cp -f "/usr/include/python3.6m/moduleobject.h" "$(@D)/python_include/moduleobject.h" && cp -f "/usr/include/python3.6m/namespaceobject.h" "$(@D)/python_include/namespaceobject.h" && cp -f "/usr/include/python3.6m/node.h" "$(@D)/python_include/node.h" && cp -f "/usr/include/python3.6m/object.h" "$(@D)/python_include/object.h" && cp -f "/usr/include/python3.6m/objimpl.h" "$(@D)/python_include/objimpl.h" && cp -f "/usr/include/python3.6m/odictobject.h" "$(@D)/python_include/odictobject.h" && cp -f "/usr/include/python3.6m/opcode.h" "$(@D)/python_include/opcode.h" && cp -f "/usr/include/python3.6m/osdefs.h" "$(@D)/python_include/osdefs.h" && cp -f "/usr/include/python3.6m/osmodule.h" "$(@D)/python_include/osmodule.h" && cp -f "/usr/include/python3.6m/parsetok.h" "$(@D)/python_include/parsetok.h" && cp -f "/usr/include/python3.6m/patchlevel.h" "$(@D)/python_include/patchlevel.h" && cp -f "/usr/include/python3.6m/pgen.h" "$(@D)/python_include/pgen.h" && cp -f "/usr/include/python3.6m/pgenheaders.h" "$(@D)/python_include/pgenheaders.h" && cp -f "/usr/include/python3.6m/py_curses.h" "$(@D)/python_include/py_curses.h" && cp -f "/usr/include/python3.6m/pyarena.h" "$(@D)/python_include/pyarena.h" && cp -f "/usr/include/python3.6m/pyatomic.h" "$(@D)/python_include/pyatomic.h" && cp -f "/usr/include/python3.6m/pycapsule.h" "$(@D)/python_include/pycapsule.h" && cp -f "/usr/include/python3.6m/pyconfig.h" "$(@D)/python_include/pyconfig.h" && cp -f "/usr/include/python3.6m/pyctype.h" "$(@D)/python_include/pyctype.h" && cp -f "/usr/include/python3.6m/pydebug.h" "$(@D)/python_include/pydebug.h" && cp -f "/usr/include/python3.6m/pydtrace.h" "$(@D)/python_include/pydtrace.h" && cp -f "/usr/include/python3.6m/pyerrors.h" "$(@D)/python_include/pyerrors.h" && cp -f "/usr/include/python3.6m/pyexpat.h" "$(@D)/python_include/pyexpat.h" && cp -f "/usr/include/python3.6m/pyfpe.h" "$(@D)/python_include/pyfpe.h" && cp -f "/usr/include/python3.6m/pygetopt.h" "$(@D)/python_include/pygetopt.h" && cp -f "/usr/include/python3.6m/pyhash.h" "$(@D)/python_include/pyhash.h" && cp -f "/usr/include/python3.6m/pylifecycle.h" "$(@D)/python_include/pylifecycle.h" && cp -f "/usr/include/python3.6m/pymacconfig.h" "$(@D)/python_include/pymacconfig.h" && cp -f "/usr/include/python3.6m/pymacro.h" "$(@D)/python_include/pymacro.h" && cp -f "/usr/include/python3.6m/pymath.h" "$(@D)/python_include/pymath.h" && cp -f "/usr/include/python3.6m/pymem.h" "$(@D)/python_include/pymem.h" && cp -f "/usr/include/python3.6m/pyport.h" "$(@D)/python_include/pyport.h" && cp -f "/usr/include/python3.6m/pystate.h" "$(@D)/python_include/pystate.h" && cp -f "/usr/include/python3.6m/pystrcmp.h" "$(@D)/python_include/pystrcmp.h" && cp -f "/usr/include/python3.6m/pystrhex.h" "$(@D)/python_include/pystrhex.h" && cp -f "/usr/include/python3.6m/pystrtod.h" "$(@D)/python_include/pystrtod.h" && cp -f "/usr/include/python3.6m/pythonrun.h" "$(@D)/python_include/pythonrun.h" && cp -f "/usr/include/python3.6m/pythread.h" "$(@D)/python_include/pythread.h" && cp -f "/usr/include/python3.6m/pytime.h" "$(@D)/python_include/pytime.h" && cp -f "/usr/include/python3.6m/rangeobject.h" "$(@D)/python_include/rangeobject.h" && cp -f "/usr/include/python3.6m/setobject.h" "$(@D)/python_include/setobject.h" && cp -f "/usr/include/python3.6m/sliceobject.h" "$(@D)/python_include/sliceobject.h" && cp -f "/usr/include/python3.6m/structmember.h" "$(@D)/python_include/structmember.h" && cp -f "/usr/include/python3.6m/structseq.h" "$(@D)/python_include/structseq.h" && cp -f "/usr/include/python3.6m/symtable.h" "$(@D)/python_include/symtable.h" && cp -f "/usr/include/python3.6m/sysmodule.h" "$(@D)/python_include/sysmodule.h" && cp -f "/usr/include/python3.6m/token.h" "$(@D)/python_include/token.h" && cp -f "/usr/include/python3.6m/traceback.h" "$(@D)/python_include/traceback.h" && cp -f "/usr/include/python3.6m/tupleobject.h" "$(@D)/python_include/tupleobject.h" && cp -f "/usr/include/python3.6m/typeslots.h" "$(@D)/python_include/typeslots.h" && cp -f "/usr/include/python3.6m/ucnhash.h" "$(@D)/python_include/ucnhash.h" && cp -f "/usr/include/python3.6m/unicodeobject.h" "$(@D)/python_include/unicodeobject.h" && cp -f "/usr/include/python3.6m/warnings.h" "$(@D)/python_include/warnings.h" && cp -f "/usr/include/python3.6m/weakrefobject.h" "$(@D)/python_include/weakrefobject.h"
-   """,
-)
-
-genrule(
-    name = "numpy_include",
-    outs = [
-        "numpy_include/numpy/__multiarray_api.h",
-        "numpy_include/numpy/__ufunc_api.h",
-        "numpy_include/numpy/_neighborhood_iterator_imp.h",
-        "numpy_include/numpy/_numpyconfig.h",
-        "numpy_include/numpy/arrayobject.h",
-        "numpy_include/numpy/arrayscalars.h",
-        "numpy_include/numpy/halffloat.h",
-        "numpy_include/numpy/multiarray_api.txt",
-        "numpy_include/numpy/ndarrayobject.h",
-        "numpy_include/numpy/ndarraytypes.h",
-        "numpy_include/numpy/noprefix.h",
-        "numpy_include/numpy/npy_1_7_deprecated_api.h",
-        "numpy_include/numpy/npy_3kcompat.h",
-        "numpy_include/numpy/npy_common.h",
-        "numpy_include/numpy/npy_cpu.h",
-        "numpy_include/numpy/npy_endian.h",
-        "numpy_include/numpy/npy_interrupt.h",
-        "numpy_include/numpy/npy_math.h",
-        "numpy_include/numpy/npy_no_deprecated_api.h",
-        "numpy_include/numpy/npy_os.h",
-        "numpy_include/numpy/numpyconfig.h",
-        "numpy_include/numpy/old_defines.h",
-        "numpy_include/numpy/oldnumeric.h",
-        "numpy_include/numpy/ufunc_api.txt",
-        "numpy_include/numpy/ufuncobject.h",
-        "numpy_include/numpy/utils.h",
-    ],
-    cmd = """
-cp -f "/usr/local/lib/python3.6/dist-packages/numpy/core/include/numpy/__multiarray_api.h" "$(@D)/numpy_include/numpy/__multiarray_api.h" && cp -f "/usr/local/lib/python3.6/dist-packages/numpy/core/include/numpy/__ufunc_api.h" "$(@D)/numpy_include/numpy/__ufunc_api.h" && cp -f "/usr/local/lib/python3.6/dist-packages/numpy/core/include/numpy/_neighborhood_iterator_imp.h" "$(@D)/numpy_include/numpy/_neighborhood_iterator_imp.h" && cp -f "/usr/local/lib/python3.6/dist-packages/numpy/core/include/numpy/_numpyconfig.h" "$(@D)/numpy_include/numpy/_numpyconfig.h" && cp -f "/usr/local/lib/python3.6/dist-packages/numpy/core/include/numpy/arrayobject.h" "$(@D)/numpy_include/numpy/arrayobject.h" && cp -f "/usr/local/lib/python3.6/dist-packages/numpy/core/include/numpy/arrayscalars.h" "$(@D)/numpy_include/numpy/arrayscalars.h" && cp -f "/usr/local/lib/python3.6/dist-packages/numpy/core/include/numpy/halffloat.h" "$(@D)/numpy_include/numpy/halffloat.h" && cp -f "/usr/local/lib/python3.6/dist-packages/numpy/core/include/numpy/multiarray_api.txt" "$(@D)/numpy_include/numpy/multiarray_api.txt" && cp -f "/usr/local/lib/python3.6/dist-packages/numpy/core/include/numpy/ndarrayobject.h" "$(@D)/numpy_include/numpy/ndarrayobject.h" && cp -f "/usr/local/lib/python3.6/dist-packages/numpy/core/include/numpy/ndarraytypes.h" "$(@D)/numpy_include/numpy/ndarraytypes.h" && cp -f "/usr/local/lib/python3.6/dist-packages/numpy/core/include/numpy/noprefix.h" "$(@D)/numpy_include/numpy/noprefix.h" && cp -f "/usr/local/lib/python3.6/dist-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_1_7_deprecated_api.h" && cp -f "/usr/local/lib/python3.6/dist-packages/numpy/core/include/numpy/npy_3kcompat.h" "$(@D)/numpy_include/numpy/npy_3kcompat.h" && cp -f "/usr/local/lib/python3.6/dist-packages/numpy/core/include/numpy/npy_common.h" "$(@D)/numpy_include/numpy/npy_common.h" && cp -f "/usr/local/lib/python3.6/dist-packages/numpy/core/include/numpy/npy_cpu.h" "$(@D)/numpy_include/numpy/npy_cpu.h" && cp -f "/usr/local/lib/python3.6/dist-packages/numpy/core/include/numpy/npy_endian.h" "$(@D)/numpy_include/numpy/npy_endian.h" && cp -f "/usr/local/lib/python3.6/dist-packages/numpy/core/include/numpy/npy_interrupt.h" "$(@D)/numpy_include/numpy/npy_interrupt.h" && cp -f "/usr/local/lib/python3.6/dist-packages/numpy/core/include/numpy/npy_math.h" "$(@D)/numpy_include/numpy/npy_math.h" && cp -f "/usr/local/lib/python3.6/dist-packages/numpy/core/include/numpy/npy_no_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_no_deprecated_api.h" && cp -f "/usr/local/lib/python3.6/dist-packages/numpy/core/include/numpy/npy_os.h" "$(@D)/numpy_include/numpy/npy_os.h" && cp -f "/usr/local/lib/python3.6/dist-packages/numpy/core/include/numpy/numpyconfig.h" "$(@D)/numpy_include/numpy/numpyconfig.h" && cp -f "/usr/local/lib/python3.6/dist-packages/numpy/core/include/numpy/old_defines.h" "$(@D)/numpy_include/numpy/old_defines.h" && cp -f "/usr/local/lib/python3.6/dist-packages/numpy/core/include/numpy/oldnumeric.h" "$(@D)/numpy_include/numpy/oldnumeric.h" && cp -f "/usr/local/lib/python3.6/dist-packages/numpy/core/include/numpy/ufunc_api.txt" "$(@D)/numpy_include/numpy/ufunc_api.txt" && cp -f "/usr/local/lib/python3.6/dist-packages/numpy/core/include/numpy/ufuncobject.h" "$(@D)/numpy_include/numpy/ufuncobject.h" && cp -f "/usr/local/lib/python3.6/dist-packages/numpy/core/include/numpy/utils.h" "$(@D)/numpy_include/numpy/utils.h"
-   """,
-)
diff --git a/third_party/toolchains/preconfig/ubuntu16.04/py3/WORKSPACE b/third_party/toolchains/preconfig/ubuntu16.04/py3/WORKSPACE
deleted file mode 100644
index 1d298fefa3b..00000000000
--- a/third_party/toolchains/preconfig/ubuntu16.04/py3/WORKSPACE
+++ /dev/null
@@ -1,2 +0,0 @@
-# DO NOT EDIT: automatically generated WORKSPACE file for python_configure rule
-workspace(name = "local_config_python")
diff --git a/third_party/toolchains/preconfig/win/BUILD b/third_party/toolchains/preconfig/win/BUILD
index 519d8e5110d..d612636a834 100644
--- a/third_party/toolchains/preconfig/win/BUILD
+++ b/third_party/toolchains/preconfig/win/BUILD
@@ -14,8 +14,17 @@ platform(
         "@bazel_tools//platforms:x86_64",
         "@bazel_tools//platforms:windows",
     ],
-    exec_properties = {
-        "container-image": "docker://gcr.io/tensorflow-testing/tf-win2019-rbe@sha256:5e91ddd99345204cd8da2e687d312eb64b3916f257023fd1b651b3dabefd9286",
-        "OSFamily": "Windows",
-    },
+    remote_execution_properties = """
+        properties:{
+          name: "container-image"
+          value: "docker://gcr.io/tensorflow-testing/tf-win2019-rbe@sha256:5e91ddd99345204cd8da2e687d312eb64b3916f257023fd1b651b3dabefd9286"
+        }
+        properties:{
+          name: "OSFamily"
+          value: "Windows"
+        }
+        properties:{
+          name: "Pool" value: "win2019"
+        }
+    """,
 )
diff --git a/third_party/toolchains/preconfig/win_1803/BUILD b/third_party/toolchains/preconfig/win_1803/BUILD
index 4e315e8f086..94ac82d638c 100644
--- a/third_party/toolchains/preconfig/win_1803/BUILD
+++ b/third_party/toolchains/preconfig/win_1803/BUILD
@@ -14,10 +14,19 @@ platform(
         "@bazel_tools//platforms:x86_64",
         "@bazel_tools//platforms:windows",
     ],
-    exec_properties = {
-        "container-image": "docker://gcr.io/tensorflow-testing/tf-win-rbe@sha256:f109576c7c0c8a1783ff22b666e8923b52dbbe7933f69a1c7a7275202c304a12",
-        "OSFamily": "Windows",
-    },
+    remote_execution_properties = """
+        properties:{
+          name: "container-image"
+          value: "docker://gcr.io/tensorflow-testing/tf-win-rbe@sha256:f109576c7c0c8a1783ff22b666e8923b52dbbe7933f69a1c7a7275202c304a12"
+        }
+        properties:{
+          name: "OSFamily"
+          value: "Windows"
+        }
+        properties:{
+          name: "Pool" value: "default"
+        }
+    """,
 )
 
 platform(
@@ -26,9 +35,21 @@
         "@bazel_tools//platforms:x86_64",
         "@bazel_tools//platforms:windows",
     ],
-    exec_properties = {
-        "container-image": "",
-        "sandbox": "none",
-        "OSFamily": "Windows",
-    },
+    remote_execution_properties = """
+        properties:{
+          name: "container-image"
+          value: ""
+        }
+        properties:{
+          name: "sandbox"
+          value: "none"
+        }
+        properties:{
+          name: "OSFamily"
+          value: "Windows"
+        }
+        properties:{
+          name: "Pool" value: "default"
+        }
+    """,
 )
diff --git a/third_party/toolchains/remote_config/configs.bzl b/third_party/toolchains/remote_config/configs.bzl
index 2c2bcfb59b3..4945db280b6 100644
--- a/third_party/toolchains/remote_config/configs.bzl
+++ b/third_party/toolchains/remote_config/configs.bzl
@@ -1,8 +1,15 @@
 """Configurations of RBE builds used with remote config."""
 
-load("//third_party/toolchains/remote_config:rbe_config.bzl", "tensorflow_rbe_config")
+load("//third_party/toolchains/remote_config:rbe_config.bzl", "tensorflow_rbe_config", "tensorflow_rbe_win_config")
 
 def initialize_rbe_configs():
+    tensorflow_rbe_config(
+        name = "ubuntu16.04-manylinux2010-py3",
+        os = "ubuntu16.04-manylinux2010",
+        python_version = "3",
+        compiler = "",
+    )
+
     tensorflow_rbe_config(
         name = "ubuntu16.04-py3-gcc7_manylinux2010-cuda10.0-cudnn7-tensorrt5.1",
         compiler = "/dt7/usr/bin/gcc",
@@ -22,3 +29,8 @@ def initialize_rbe_configs():
         python_version = "3",
         rocm_version = "2.5",  # Any version will do.
     )
+
+    tensorflow_rbe_win_config(
+        name = "windows_py37",
+        python_bin_path = "C:/Python37/python.exe",
+    )
diff --git a/third_party/toolchains/remote_config/containers.bzl b/third_party/toolchains/remote_config/containers.bzl
index 8813da19e00..27e0cceee1c 100644
--- a/third_party/toolchains/remote_config/containers.bzl
+++ b/third_party/toolchains/remote_config/containers.bzl
@@ -3,6 +3,12 @@
 load("//third_party/toolchains/preconfig/generate:containers.bzl", "container_digests")
 
 containers = {
+    # Built with //tensorflow/tools/ci_build/Dockerfile.rbe.ubuntu16.04-manylinux2010.
+    "ubuntu16.04-manylinux2010": {
+        "registry": "gcr.io",
+        "repository": "tensorflow-testing/nosla-ubuntu16.04-manylinux2010",
+        "digest": container_digests["ubuntu16.04-manylinux2010"],
+    },
     # Built with //tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu16.04-manylinux2010.
     "cuda10.0-cudnn7-ubuntu16.04-manylinux2010": {
@@ -17,4 +23,9 @@
         "repository": "tensorflow-testing/nosla-rocm-ubuntu16.04",
         "digest": container_digests["rocm-ubuntu16.04"],
     },
+    "windows-1803": {
+        "registry": "gcr.io",
+        "repository": "tensorflow-testing/tf-win-rbe",
+        "digest": container_digests["windows-1803"],
+    },
 }
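Note on the containers.bzl change above: each entry is keyed by a short name and pins its image by digest via container_digests, so builds are reproducible against an exact image. rbe_config.bzl turns an entry into the docker:// URI placed in exec_properties; the sketch below mirrors _container_image_uri from the next file's diff (the digest value in the comment is a placeholder):

container = containers["ubuntu16.04-manylinux2010"]
image_uri = "docker://%s/%s@%s" % (
    container["registry"],    # "gcr.io"
    container["repository"],  # "tensorflow-testing/nosla-ubuntu16.04-manylinux2010"
    container["digest"],      # pinned sha256 from container_digests
)
# -> "docker://gcr.io/tensorflow-testing/nosla-ubuntu16.04-manylinux2010@sha256:<digest>"
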
"cuda10.0-cudnn7-ubuntu16.04-manylinux2010": { @@ -17,4 +23,9 @@ containers = { "repository": "tensorflow-testing/nosla-rocm-ubuntu16.04", "digest": container_digests["rocm-ubuntu16.04"], }, + "windows-1803": { + "registry": "gcr.io", + "repository": "tensorflow-testing/tf-win-rbe", + "digest": container_digests["windows-1803"], + }, } diff --git a/third_party/toolchains/remote_config/rbe_config.bzl b/third_party/toolchains/remote_config/rbe_config.bzl index ca186f094a7..6709cad4eb3 100644 --- a/third_party/toolchains/remote_config/rbe_config.bzl +++ b/third_party/toolchains/remote_config/rbe_config.bzl @@ -13,9 +13,6 @@ def _container_image_uri(container_name): return "docker://%s/%s@%s" % (container["registry"], container["repository"], container["digest"]) def _tensorflow_rbe_config(name, compiler, python_version, os, rocm_version = None, cuda_version = None, cudnn_version = None, tensorrt_version = None, tensorrt_install_path = None, cudnn_install_path = None, compiler_prefix = None, sysroot = None): - if cuda_version == None and rocm_version == None: - fail("Neither cuda_version nor rocm_version specified. You need to specify exactly one.") - if cuda_version != None and rocm_version != None: fail("Specifying both cuda_version and rocm_version is not supported.") @@ -64,7 +61,8 @@ def _tensorflow_rbe_config(name, compiler, python_version, os, rocm_version = No remote_platform_configure( name = "%s_config_platform" % name, - container_image = container_image, + platform = "linux", + platform_exec_properties = exec_properties, ) remote_python_configure( @@ -107,7 +105,8 @@ def _tensorflow_rbe_config(name, compiler, python_version, os, rocm_version = No remote_platform_configure( name = "%s_config_platform" % name, - container_image = container_image, + platform = "linux", + platform_exec_properties = exec_properties, ) remote_python_configure( @@ -121,5 +120,43 @@ def _tensorflow_rbe_config(name, compiler, python_version, os, rocm_version = No environ = env, exec_properties = exec_properties, ) + elif python_version != None: + container_image = _container_image_uri(os) + exec_properties = { + "container-image": container_image, + "Pool": "default", + } + + remote_python_configure( + name = "%s_config_python" % name, + environ = env, + exec_properties = exec_properties, + ) + else: + fail("Neither cuda_version, rocm_version nor python_version specified.") + +def _tensorflow_rbe_win_config(name, python_bin_path, container_name = "windows-1803"): + container_image = _container_image_uri(container_name) + exec_properties = { + "container-image": container_image, + "OSFamily": "Windows", + } + + env = { + "PYTHON_BIN_PATH": python_bin_path, + } + + remote_platform_configure( + name = "%s_config_platform" % name, + platform = "windows", + platform_exec_properties = exec_properties, + ) + + remote_python_configure( + name = "%s_config_python" % name, + environ = env, + exec_properties = exec_properties, + ) tensorflow_rbe_config = _tensorflow_rbe_config +tensorflow_rbe_win_config = _tensorflow_rbe_win_config