diff --git a/.bazelrc b/.bazelrc index 1b9f5e87c6b..e765c302c28 100644 --- a/.bazelrc +++ b/.bazelrc @@ -461,12 +461,12 @@ build:rbe_linux_cuda11.0_nvcc_py3.6 --config=rbe_linux_cuda11.0_nvcc_base --repo build:rbe_linux_cuda11.0_nvcc_py3.7 --config=rbe_linux_cuda11.0_nvcc_base --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu18.04-gcc7_manylinux2010-cuda11.0-cudnn8-tensorrt7.1_config_python3.7" build:rbe_linux_cuda11.0_nvcc_py3.8 --config=rbe_linux_cuda11.0_nvcc_base --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu18.04-gcc7_manylinux2010-cuda11.0-cudnn8-tensorrt7.1_config_python3.8" -# Map default to CUDA 10.1. +# Map default to CUDA 11 for PY35 and greater. build:rbe_linux_cuda_nvcc_py27 --config=rbe_linux_cuda10.1_nvcc_py2.7 -build:rbe_linux_cuda_nvcc_py35 --config=rbe_linux_cuda10.1_nvcc_py3.5 -build:rbe_linux_cuda_nvcc_py36 --config=rbe_linux_cuda10.1_nvcc_py3.6 -build:rbe_linux_cuda_nvcc_py37 --config=rbe_linux_cuda10.1_nvcc_py3.7 -build:rbe_linux_cuda_nvcc_py38 --config=rbe_linux_cuda10.1_nvcc_py3.8 +build:rbe_linux_cuda_nvcc_py35 --config=rbe_linux_cuda11.0_nvcc_py3.5 +build:rbe_linux_cuda_nvcc_py36 --config=rbe_linux_cuda11.0_nvcc_py3.6 +build:rbe_linux_cuda_nvcc_py37 --config=rbe_linux_cuda11.0_nvcc_py3.7 +build:rbe_linux_cuda_nvcc_py38 --config=rbe_linux_cuda11.0_nvcc_py3.8 # Deprecated configs that people might still use. build:rbe_linux_cuda_nvcc --config=rbe_linux_cuda_nvcc_py36 @@ -583,9 +583,9 @@ build:release_cpu_macos --config=avx_linux build:release_gpu_common --config=release_common build:release_gpu_common --config=cuda build:release_gpu_common --config=tensorrt -build:release_gpu_common --action_env CUDA_TOOLKIT_PATH="/usr/local/cuda-10.1" -build:release_gpu_common --action_env=TF_CUDA_VERSION="10" -build:release_gpu_common --action_env=TF_CUDNN_VERSION="7" +build:release_gpu_common --action_env CUDA_TOOLKIT_PATH="/usr/local/cuda-11.0" +build:release_gpu_common --action_env=TF_CUDA_VERSION="11" +build:release_gpu_common --action_env=TF_CUDNN_VERSION="8" build:release_gpu_common --action_env=TF_NEED_TENSORRT="1" build:release_gpu_common --action_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_35,sm_37,sm_52,sm_60,sm_61,compute_70" build:release_gpu_common --action_env=TENSORRT_INSTALL_PATH="/usr/local/tensorrt" @@ -595,8 +595,7 @@ build:release_gpu_common --action_env=GCC_HOST_COMPILER_PATH="/usr/bin/gcc-5" build:release_gpu_linux --config=release_gpu_common build:release_gpu_linux --config=avx_linux -build:release_gpu_linux --crosstool_top=//third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.1:toolchain - +build:release_gpu_linux --crosstool_top=//third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda11:toolchain build:release_windows_common --config=release_common build:release_windows_common --define=no_tensorflow_py_deps=true build:release_windows_common --announce_rc diff --git a/RELEASE.md b/RELEASE.md index d4b5b27630e..7057657c340 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -81,6 +81,12 @@ server and set `dispatcher_fault_tolerance=True`. The dispatcher will store its state to `work_dir`, so that on restart it can continue from its previous state after restart. + * Added tf.data service support for sharing dataset graphs via shared + filesystem instead of over RPC. This reduces load on the dispatcher, + improving performance of distributing datasets. For this to work, the + dispatcher's `work_dir` must be accessible from workers. If the worker + fails to read from the `work_dir`, it falls back to using RPC for dataset + graph transfer. 
* Added optional `exclude_cols` parameter to CsvDataset. This parameter is the complement of `select_cols`; at most one of these should be specified. * We have implemented an optimization which reorders data-discarding @@ -88,6 +94,7 @@ dataset when it is safe to do so. The optimization can be disabled via the `experimental_optimization.reorder_data_discarding_ops` dataset option. + * `tf.data.Options` were previously immutable and can now be overridden. * `tf.image`: * Added deterministic `tf.image.stateless_random_*` functions for each `tf.image.random_*` function. Added a new op @@ -106,7 +113,8 @@ * Error messages when Functional API construction goes wrong (and when ops cannot be converted to Keras layers automatically) should be clearer and easier to understand. * `Optimizer.minimize` can now accept a loss `Tensor` and a `GradientTape` as an alternative to accepting a `callable` loss. - * Added `beta` parameter to FTRL optimizer to match paper. + * Added `beta` hyperparameter to FTRL optimizer classes (Keras and others) + to match FTRL paper (https://research.google.com/pubs/archive/41159.pdf). * Added `mobilenet_v3` to keras application model. * `Optimizer.__init__` now accepts a `gradient_aggregator` to allow for customization of how gradients are aggregated across devices, as well as @@ -155,6 +163,14 @@ * * Tracing and Debugging: * +* `tf.train.Checkpoint`: + * Now accepts a `root` argument in the initialization, which generates a + checkpoint with a root object. This allows users to create a `Checkpoint` + object that is compatible with Keras `model.save_weights()` and + `model.load_weights()`. The checkpoint is also compatible with the + checkpoint saved in the `variables/` folder in the SavedModel. + * When restoring, `save_path` can be a path to a SavedModel. The function + will automatically find the checkpoint in the SavedModel. * Other: * We have replaced uses of "whitelist" and "blacklist" with "allowlist" and "denylist" where possible. Please see @@ -251,6 +267,7 @@ stjohnso98, , , , , * Mutable tables now restore checkpointed values when loaded from SavedModel. * GPU * TF 2.3 includes PTX kernels only for [compute capability](https://developer.nvidia.com/cuda-gpus) 7.0 to reduce the TF pip binary size. Earlier releases included PTX for a variety of older compute capabilities. + * Removed environment variable `TF_USE_CUDNN`. * Others * Retain parent namescope for ops added inside `tf.while_loop`/`tf.cond`/`tf.switch_case`. * Update `tf.vectorized_map` to support vectorizing `tf.while_loop` and TensorList operations. @@ -1582,6 +1599,7 @@ Yuan (Terry) Tang, Yuchen Ying, Yves-Noel Weweler, zhangyujing, zjjott, zyeric, color palette of the frame. This has been fixed now * image.resize now considers proper pixel centers and has new kernels (incl. anti-aliasing). + * Added an isotonic regression solver (tf.nn.isotonic_regression). * Performance * Turn on MKL-DNN contraction kernels by default.
MKL-DNN dynamically dispatches the best kernel implementation based on CPU vector diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index fb5a0d250bb..9d8032aca52 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -58,9 +58,9 @@ filegroup( visibility = ["//visibility:public"], ) -filegroup( +cc_library( name = "pywrap_required_hdrs", - srcs = [ + textual_hdrs = [ "c_api_internal.h", "c_api_macros.h", "conversion_macros.h", @@ -220,6 +220,7 @@ cc_library( name = "logging", srcs = ["logging.cc"], hdrs = ["logging.h"], + visibility = ["//visibility:public"], deps = [ ":c_api_macros", "//tensorflow/core/platform:logging", diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index 47452c245dc..ce2e2382309 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -240,6 +240,7 @@ tf_cuda_cc_test( "//tensorflow/c:c_api", "//tensorflow/c:c_test_util", "//tensorflow/c:tf_status_helper", + "//tensorflow/c/experimental/gradients:array_grad", "//tensorflow/c/experimental/gradients:math_grad", "//tensorflow/c/experimental/ops:array_ops", "//tensorflow/cc/profiler", @@ -255,6 +256,72 @@ tf_cuda_cc_test( ], ) +cc_library( + name = "mnist_gradients_testutil", + srcs = [ + "mnist_gradients_testutil.cc", + ], + hdrs = [ + "mnist_gradients_testutil.h", + ], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + ":abstract_context", + ":abstract_operation", + ":abstract_tensor_handle", + ":c_api_unified_internal", + ":gradients_internal", + ":tape", + "//tensorflow/c/experimental/ops:array_ops", + "//tensorflow/c/experimental/ops:math_ops", + "//tensorflow/c/experimental/ops:nn_ops", + "//tensorflow/core/common_runtime/eager:attr_builder", + "//tensorflow/core/lib/llvm_rtti", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/strings", + ], +) + +tf_cuda_cc_test( + name = "mnist_gradients_test", + size = "small", + srcs = [ + "mnist_gradients_test.cc", + ], + args = ["--heap_check=local"], + extra_copts = tfe_xla_copts(), + linkstatic = tf_kernel_tests_linkstatic(), + tags = tf_cuda_tests_tags() + ["nomac"], + deps = [ + ":abstract_tensor_handle", + ":c_api_experimental", + ":c_api_test_util", + ":c_api_unified_internal", + ":gradients_internal", + ":mnist_gradients_testutil", + "//tensorflow/c:c_api", + "//tensorflow/c:c_test_util", + "//tensorflow/c:tf_status_helper", + "//tensorflow/c/experimental/gradients:math_grad", + "//tensorflow/c/experimental/gradients:nn_grad", + "//tensorflow/c/experimental/ops:array_ops", + "//tensorflow/c/experimental/ops:math_ops", + "//tensorflow/c/experimental/ops:nn_ops", + "//tensorflow/cc/profiler", + "//tensorflow/compiler/mlir/tensorflow/c:mlir_c_api_registration", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/lib/llvm_rtti", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + ], +) + cc_library( name = "abstract_tensor_handle", hdrs = ["abstract_tensor_handle.h"], diff --git a/tensorflow/c/eager/c_api_remote_function_test.cc b/tensorflow/c/eager/c_api_remote_function_test.cc index d3f9826635c..a9bbd5b694f 100644 --- a/tensorflow/c/eager/c_api_remote_function_test.cc +++ b/tensorflow/c/eager/c_api_remote_function_test.cc @@ -30,18 +30,26 @@ TEST(CAPI, RemoteExecuteSilentCopiesAsyncFunc) { TestRemoteExecuteSilentCopiesFunc(/*async=*/true, /*remote=*/true, /*heavy_load_on_streaming_rpc=*/false); } +TEST(CAPI, 
RemoteExecuteSilentCopiesFuncRemoteOutputs) { + TestRemoteExecuteSilentCopiesFunc(/*async=*/false, /*remote=*/true, + /*heavy_load_on_streaming_rpc=*/false, + /*remote_func_outputs=*/true); +} +TEST(CAPI, RemoteExecuteSilentCopiesAsyncFuncRemoteOutputs) { + TestRemoteExecuteSilentCopiesFunc(/*async=*/true, /*remote=*/true, + /*heavy_load_on_streaming_rpc=*/false, + /*remote_func_outputs=*/true); +} TEST(CAPI, RemoteExecuteSilentCopiesLocalAsyncFunc) { TestRemoteExecuteSilentCopiesFunc(/*async=*/true, /*remote=*/false, /*heavy_load_on_streaming_rpc=*/false); } -// TODO(b/162618595): Enable this test once we remove the check of remote -// outputs in ProcessFunctionLibraryRuntime. -TEST(CAPI, DISABLED_RemoteExecuteSilentCopiesLocalFuncRemoteOutputs) { +TEST(CAPI, RemoteExecuteSilentCopiesLocalFuncRemoteOutputs) { TestRemoteExecuteSilentCopiesFunc(/*async=*/false, /*remote=*/false, /*heavy_load_on_streaming_rpc=*/false, /*remote_func_outputs=*/true); } -TEST(CAPI, DISABLED_RemoteExecuteSilentCopiesLocalAsyncFuncRemoteOutputs) { +TEST(CAPI, RemoteExecuteSilentCopiesLocalAsyncFuncRemoteOutputs) { TestRemoteExecuteSilentCopiesFunc(/*async=*/true, /*remote=*/false, /*heavy_load_on_streaming_rpc=*/false, /*remote_func_outputs=*/true); diff --git a/tensorflow/c/eager/c_api_remote_test_util.cc b/tensorflow/c/eager/c_api_remote_test_util.cc index 0ae5b74553a..159fa442a73 100644 --- a/tensorflow/c/eager/c_api_remote_test_util.cc +++ b/tensorflow/c/eager/c_api_remote_test_util.cc @@ -169,6 +169,13 @@ void TestRemoteExecuteSilentCopies(bool async, bool remote, bool func, ASSERT_TRUE(remote_arg->HasLocalMirror(nullptr)); } + if (remote_func_outputs) { + const string backing_device = + TFE_TensorHandleBackingDeviceName(retvals[0], status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + EXPECT_EQ(backing_device, task2_name); + } + auto* retval_task0 = TFE_TensorHandleCopyToDevice( retvals[0], ctx, "/job:localhost/replica:0/task:0/device:CPU:0", status); ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); diff --git a/tensorflow/c/eager/c_api_test_util.cc b/tensorflow/c/eager/c_api_test_util.cc index 192f10533a6..fd68866f502 100644 --- a/tensorflow/c/eager/c_api_test_util.cc +++ b/tensorflow/c/eager/c_api_test_util.cc @@ -102,6 +102,32 @@ TFE_TensorHandle* TestMatrixTensorHandleWithInput(TFE_Context* ctx, return th; } +TFE_TensorHandle* TestTensorHandleWithDimsFloat(TFE_Context* ctx, float data[], + int64_t dims[], int num_dims) { + TF_Status* status = TF_NewStatus(); + TF_Tensor* t = + TFE_AllocateHostTensor(ctx, TF_FLOAT, &dims[0], num_dims, status); + memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t)); + TFE_TensorHandle* th = TFE_NewTensorHandleFromTensor(ctx, t, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TF_DeleteTensor(t); + TF_DeleteStatus(status); + return th; +} + +TFE_TensorHandle* TestTensorHandleWithDimsInt(TFE_Context* ctx, int data[], + int64_t dims[], int num_dims) { + TF_Status* status = TF_NewStatus(); + TF_Tensor* t = + TFE_AllocateHostTensor(ctx, TF_INT32, &dims[0], num_dims, status); + memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t)); + TFE_TensorHandle* th = TFE_NewTensorHandleFromTensor(ctx, t, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TF_DeleteTensor(t); + TF_DeleteStatus(status); + return th; +} + TFE_TensorHandle* TestMatrixTensorHandle100x100(TFE_Context* ctx) { constexpr int64_t dims[] = {100, 100}; constexpr int num_elements = dims[0] * dims[1]; diff --git 
a/tensorflow/c/eager/c_api_test_util.h b/tensorflow/c/eager/c_api_test_util.h index fcf407aa9c3..2f77ae5cf44 100644 --- a/tensorflow/c/eager/c_api_test_util.h +++ b/tensorflow/c/eager/c_api_test_util.h @@ -40,6 +40,14 @@ TFE_TensorHandle* TestMatrixTensorHandleWithInput(TFE_Context* ctx, float data[], int64_t dims[], int num_dims); +// Get a Matrix TensorHandle with given float values and dimensions +TFE_TensorHandle* TestTensorHandleWithDimsFloat(TFE_Context* ctx, float data[], + int64_t dims[], int num_dims); + +// Get a Matrix TensorHandle with given int values and dimensions +TFE_TensorHandle* TestTensorHandleWithDimsInt(TFE_Context* ctx, int data[], + int64_t dims[], int num_dims); + // Return a tensor handle containing a 100x100 matrix of floats TFE_TensorHandle* TestMatrixTensorHandle100x100(TFE_Context* ctx); diff --git a/tensorflow/c/eager/c_api_unified_experimental_graph.cc b/tensorflow/c/eager/c_api_unified_experimental_graph.cc index 7bda3aed76d..9d064039141 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_graph.cc +++ b/tensorflow/c/eager/c_api_unified_experimental_graph.cc @@ -85,7 +85,11 @@ class GraphOperation : public TracingOperation { return errors::FailedPrecondition( "GraphOperation::Reset must be called before calling SetOpName."); } - op_.reset(TF_NewOperation(g_, op_type_.c_str(), op_name)); + // TODO(b/145674566): We use Graph::NewName to get a unique name here but + // this may not be consistent with python's naming policy. + mutex_lock l(g_->mu); + op_.reset(new TF_OperationDescription(g_, op_type_.c_str(), + g_->graph.NewName(op_name).c_str())); return Status::OK(); } const string& Name() const override { return op_type_; } diff --git a/tensorflow/c/eager/c_api_unified_experimental_test.cc b/tensorflow/c/eager/c_api_unified_experimental_test.cc index c669ff4cf96..7b3a497a0c5 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_test.cc +++ b/tensorflow/c/eager/c_api_unified_experimental_test.cc @@ -557,7 +557,7 @@ TEST_P(UnifiedCAPI, TestMultiOutputGraph) { auto* add_op = TF_NewAbstractOp(graph_ctx); TF_AbstractOpSetOpType(add_op, "Add", s); ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); - TF_AbstractOpSetOpName(add_op, "my_add1", s); + TF_AbstractOpSetOpName(add_op, "my_add", s); ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); TF_AbstractTensor* inputs[2] = {arg0, arg1}; TF_OutputList* add_outputs = TF_NewOutputList(); @@ -579,7 +579,7 @@ TEST_P(UnifiedCAPI, TestMultiOutputGraph) { auto* add_op = TF_NewAbstractOp(graph_ctx); TF_AbstractOpSetOpType(add_op, "Add", s); ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); - TF_AbstractOpSetOpName(add_op, "my_add2", s); + TF_AbstractOpSetOpName(add_op, "my_add", s); ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); TF_AbstractTensor* inputs[2] = {arg1, arg1}; TF_OutputList* add_outputs = TF_NewOutputList(); diff --git a/tensorflow/c/eager/gradients.cc b/tensorflow/c/eager/gradients.cc index 39cadd421e2..9bcd0d0fea0 100644 --- a/tensorflow/c/eager/gradients.cc +++ b/tensorflow/c/eager/gradients.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/c/eager/gradients.h" #include "absl/strings/str_cat.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" #include "tensorflow/c/eager/c_api_unified_experimental_internal.h" #include "tensorflow/c/eager/gradients_internal.h" #include "tensorflow/core/common_runtime/eager/attr_builder.h" @@ -23,25 +24,97 @@ limitations under the License. 
namespace tensorflow { namespace gradients { -Status GradientRegistry::Register(const string& op_name, - GradientFunctionFactory factory) { +namespace { +Status ZerosLike(AbstractContext* ctx, AbstractTensorHandle* t, + AbstractTensorHandle** result) { + AbstractOperationPtr op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(op->Reset("ZerosLike", /*raw_device_name=*/nullptr)); + if (isa(op.get())) { + TF_RETURN_IF_ERROR(dyn_cast(op.get())->SetOpName( + absl::StrCat("ZerosLike", ToId(t)).c_str())); + } + TF_RETURN_IF_ERROR(op->AddInput(t)); + int num_outputs = 1; + std::vector outputs(num_outputs); + TF_RETURN_IF_ERROR( + op->Execute(absl::Span(outputs), &num_outputs)); + *result = outputs[0]; + return Status::OK(); +} +} // namespace + +class IncomingGradientsImpl : public IncomingGradients { + public: + explicit IncomingGradientsImpl( + absl::Span grad_inputs, Context* ctx, + DefaultGradientFunction* default_gradients) + : grad_inputs_(grad_inputs), + ctx_(ctx), + default_gradients_(default_gradients) {} + AbstractTensorHandle* operator[](int i) const override { + return default_gradients_->get(ctx_, grad_inputs_, i); + } + size_t size() const override { return grad_inputs_.size(); } + + private: + absl::Span grad_inputs_; + Context* ctx_; + DefaultGradientFunction* default_gradients_; +}; + +AllZerosDefaultGradients::AllZerosDefaultGradients(const ForwardOperation& op) + : outputs_(op.outputs) { + for (auto output : outputs_) { + output->Ref(); + } +} +AbstractTensorHandle* AllZerosDefaultGradients::get( + Context* ctx, absl::Span grad_inputs, int i) { + if (grad_inputs[i]) { + return grad_inputs[i]; + } + if (cached_default_grads_[i]) { + return cached_default_grads_[i].get(); + } + AbstractTensorHandle* result = nullptr; + Status s = ZerosLike(ctx->ctx, outputs_[i], &result); + if (!s.ok()) { + if (result) { + result->Unref(); + } + VLOG(1) << "Failed to create ZerosLike for index " << i; + return nullptr; + } + cached_default_grads_[i].reset(result); + return result; +} + +PassThroughDefaultGradients::PassThroughDefaultGradients( + const ForwardOperation& op) {} +AbstractTensorHandle* PassThroughDefaultGradients::get( + Context* ctx, absl::Span grad_inputs, int i) { + return grad_inputs[i]; +} + +Status GradientRegistry::Register( + const string& op_name, BackwardFunctionFactory backward_function_factory) { auto iter = registry_.find(op_name); if (iter != registry_.end()) { const string error_msg = "Gradient already exists for op: " + op_name + "."; return errors::AlreadyExists(error_msg); } - registry_.insert({op_name, factory}); + registry_.insert({op_name, backward_function_factory}); return Status::OK(); } Status GradientRegistry::Lookup( const ForwardOperation& op, - std::unique_ptr* grad_fn) const { + std::unique_ptr* backward_function) const { auto iter = registry_.find(op.op_name); if (iter == registry_.end()) { const string error_msg = "No gradient defined for op: " + op.op_name + "."; return errors::NotFound(error_msg); } - grad_fn->reset(iter->second(op)); + backward_function->reset(iter->second(op)); return Status::OK(); } @@ -92,33 +165,8 @@ AbstractTensorHandle* TapeTensor::OnesLike() const { } return outputs[0]; } -AbstractTensorHandle* TapeTensor::ZerosLike() const { - AbstractOperationPtr op(ctx_->CreateOperation()); - // TODO(srbs): Consider adding a TF_RETURN_NULLPTR_IF_ERROR. 
- Status s = op->Reset("ZerosLike", /*raw_device_name=*/nullptr); - if (!s.ok()) { - return nullptr; - } - if (isa(op.get())) { - s = dyn_cast(op.get())->SetOpName( - absl::StrCat("ZerosLike", ToId(handle_)).c_str()); - if (!s.ok()) { - return nullptr; - } - } - s = op->AddInput(handle_); - if (!s.ok()) { - return nullptr; - } - int num_outputs = 1; - // TODO(srbs): Figure out who is in charge of releasing this. - std::vector outputs(num_outputs); - s = op->Execute(absl::Span(outputs), &num_outputs); - if (!s.ok()) { - return nullptr; - } - return outputs[0]; -} + +AbstractTensorHandle* TapeTensor::ZerosLike() const { return nullptr; } // Returns the number of elements in the gradient tensor. int64 TapeVSpace::NumElements(AbstractTensorHandle* tensor) const { @@ -159,13 +207,16 @@ AbstractTensorHandle* TapeVSpace::AggregateGradients( // Calls the passed-in backward function. Status TapeVSpace::CallBackwardFunction( - GradientFunction* backward_function, + BackwardFunction* backward_function, const std::vector& unneeded_gradients, gtl::ArraySlice output_gradients, std::vector* result) const { if (backward_function == nullptr) return Status::OK(); Context ctx = {ctx_}; - return backward_function->Compute(&ctx, output_gradients, result); + IncomingGradientsImpl incoming_gradients( + output_gradients, &ctx, backward_function->GetDefaultGradientFunction()); + return backward_function->GetGradientFunction()->Compute( + &ctx, incoming_gradients, result); } // Looks up the ID of a Gradient. @@ -373,15 +424,15 @@ Status Execute(AbstractOperation* op_, AbstractContext* ctx, } tape->RecordOperation( op_->Name(), tape_tensors, input_ids, input_dtypes, - [registry, forward_op_]() -> GradientFunction* { - std::unique_ptr grad_fn; - Status s = registry.Lookup(*forward_op_, &grad_fn); + [registry, forward_op_]() -> BackwardFunction* { + std::unique_ptr backward_fn; + Status s = registry.Lookup(*forward_op_, &backward_fn); if (!s.ok()) { return nullptr; } - return grad_fn.release(); + return backward_fn.release(); }, - [](GradientFunction* ptr) { + [](BackwardFunction* ptr) { if (ptr) { delete ptr; } diff --git a/tensorflow/c/eager/gradients.h b/tensorflow/c/eager/gradients.h index 267ee5b7ab2..04e11291404 100644 --- a/tensorflow/c/eager/gradients.h +++ b/tensorflow/c/eager/gradients.h @@ -55,18 +55,25 @@ struct Context { public: AbstractContext* ctx; }; + +class IncomingGradients { + public: + virtual AbstractTensorHandle* operator[](int i) const = 0; + virtual size_t size() const = 0; + virtual ~IncomingGradients() {} +}; + class GradientFunction { public: // TODO(srbs): How we support CompositeTensors e.g. IndexedSlices in // `grad_inputs`. - virtual Status Compute(Context* ctx, - absl::Span grad_inputs, + virtual Status Compute(Context* ctx, const IncomingGradients& grad_inputs, std::vector* grad_outputs) = 0; virtual ~GradientFunction() {} }; // Metadata from the forward operation that is made available to the -// gradient registerer to instantiate a GradientFunction. +// gradient registerer to instantiate a BackwardFunction. struct ForwardOperation { public: string op_name; @@ -76,18 +83,86 @@ struct ForwardOperation { AbstractContext* ctx; }; -using GradientFunctionFactory = - std::function; - -// Map from op name to a `GradientFunctionFactory`. -class GradientRegistry { +// Interface for building default zeros gradients for op outputs which are +// missing incoming gradients. 
Custom implementations of this can be used to +// control which of the forward op's output tensors/their metadata needs to +// be kept around in memory to build the default zeros grad. +// +// Some common helper implementations are provided below. +class DefaultGradientFunction { public: - Status Register(const string& op, GradientFunctionFactory factory); - Status Lookup(const ForwardOperation& op, - std::unique_ptr* grad_fn) const; + virtual AbstractTensorHandle* get( + Context* ctx, absl::Span grad_inputs, + int i) = 0; + virtual ~DefaultGradientFunction() {} +}; + +// Returns zeros for any `nullptr` in `grad_inputs`. +// +// This may require keeping track of all of forward op's output +// tensors and hence may incur a higher memory footprint. Use sparingly. +// +// Multiple calls to `AllZerosDefaultGradients::get` return the same tensor +// handle. +// +// The destructor of this class `Unref`'s any cached tensor handles so users of +// those tensor handles should `Ref` them in order to keep them alive if needed. +class AllZerosDefaultGradients : public DefaultGradientFunction { + public: + explicit AllZerosDefaultGradients(const ForwardOperation& op); + AbstractTensorHandle* get(Context* ctx, + absl::Span grad_inputs, + int i) override; private: - absl::flat_hash_map registry_; + // TODO(srbs): We do not always need to keep the tensors around. In immediate + // execution mode we just need to store the shape and dtype. During tracing + // we may need to keep the tensor around if the shape is not full defined. + std::vector outputs_; + std::vector cached_default_grads_; +}; + +// Passes through `grad_inputs` as-is. The `GradientFunction` +// will be expected to deal with nullptr in `grad_inputs` if any. +class PassThroughDefaultGradients : public DefaultGradientFunction { + public: + explicit PassThroughDefaultGradients(const ForwardOperation& op); + AbstractTensorHandle* get(Context* ctx, + absl::Span grad_inputs, + int i) override; +}; + +// A `BackwardFunction` wraps a `GradientFunction` and a +// `DefaultGradientFunction`. Both are owned by this class' instance. +class BackwardFunction { + public: + BackwardFunction(GradientFunction* gradient_function, + DefaultGradientFunction* default_gradients) + : gradient_function_(gradient_function), + default_gradients_(default_gradients) {} + GradientFunction* GetGradientFunction() { return gradient_function_.get(); } + DefaultGradientFunction* GetDefaultGradientFunction() { + return default_gradients_.get(); + } + + private: + std::unique_ptr gradient_function_; + std::unique_ptr default_gradients_; +}; + +using BackwardFunctionFactory = + std::function; + +// Map from op name to a `BackwardFunctionFactory`. +class GradientRegistry { + public: + Status Register(const string& op, + BackwardFunctionFactory backward_function_factory); + Status Lookup(const ForwardOperation& op, + std::unique_ptr* backward_function) const; + + private: + absl::flat_hash_map registry_; }; // Returns a unique id for the tensor which is used by the tape to build @@ -106,9 +181,16 @@ int64 ToId(AbstractTensorHandle* t); // allow us to trace the data dependencies between operations and hence compute // gradients. // -// This also implements `ZerosLike` and `OnesLike` to create the default +// This also implements `OnesLike` to create the default // incoming gradients for tensors which do not already have an incoming // gradient. +// +// `ZerosLike` is not expected to be called and returns a nullptr. 
The creation +// of default zeros grads is handled by the `DefaultGradientFunction` registered +// for each op. +// TODO(srbs): We need to define `ZerosLike` here to keep the compiler happy. +// Figure out a way to avoid this. +// TODO(srbs): Should ZerosLike check-fail instead of returning nullptr? class TapeTensor { public: TapeTensor(AbstractTensorHandle* handle, AbstractContext* ctx); @@ -123,7 +205,7 @@ class TapeTensor { private: AbstractTensorHandle* handle_; - // The context where OnesLike and ZerosLike ops are to be created. + // The context where OnesLike ops are to be created. AbstractContext* ctx_; }; @@ -132,7 +214,7 @@ class TapeTensor { // gradient and for performing gradient aggregation. // See `tensorflow::eager::VSpace` for more details. class TapeVSpace - : public eager::VSpace { + : public eager::VSpace { public: explicit TapeVSpace(AbstractContext* ctx) : ctx_(ctx) {} ~TapeVSpace() override {} @@ -147,7 +229,7 @@ class TapeVSpace // Calls the passed-in backward function. Status CallBackwardFunction( - GradientFunction* backward_function, + BackwardFunction* backward_function, const std::vector& unneeded_gradients, gtl::ArraySlice output_gradients, std::vector* result) const override; @@ -168,8 +250,14 @@ class TapeVSpace }; // A tracing/immediate-execution agnostic tape. +// +// Gradient functions defined for this library support handling null incoming +// gradients. `Tape::ComputeGradient` should be called with +// `build_default_zeros_grads=false`. Calling with +// `build_default_zeros_grads=true` (the default) is equivalent but just results +// in extra work because `TapeTensor::ZerosLike` returns a `nullptr` anyway. using Tape = tensorflow::eager::GradientTape; + BackwardFunction, TapeTensor>; } // namespace gradients } // namespace tensorflow diff --git a/tensorflow/c/eager/gradients_test.cc b/tensorflow/c/eager/gradients_test.cc index 944b10c000b..80b1f157074 100644 --- a/tensorflow/c/eager/gradients_test.cc +++ b/tensorflow/c/eager/gradients_test.cc @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/c/eager/c_api_unified_experimental.h" #include "tensorflow/c/eager/c_api_unified_experimental_internal.h" #include "tensorflow/c/eager/gradients_internal.h" +#include "tensorflow/c/experimental/gradients/array_grad.h" #include "tensorflow/c/experimental/gradients/math_grad.h" #include "tensorflow/c/experimental/ops/array_ops.h" #include "tensorflow/c/tf_status_helper.h" @@ -50,6 +51,7 @@ class CppGradients Status RegisterGradients(GradientRegistry* registry) { TF_RETURN_IF_ERROR(registry->Register("Add", AddRegisterer)); TF_RETURN_IF_ERROR(registry->Register("Exp", ExpRegisterer)); + TF_RETURN_IF_ERROR(registry->Register("IdentityN", IdentityNRegisterer)); return Status::OK(); } @@ -94,6 +96,26 @@ Status Exp(AbstractContext* ctx, Tape* tape, registry); } +// Computes `IdentityN(inputs)` and records it on the tape. 
+Status IdentityN(AbstractContext* ctx, Tape* tape, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + AbstractOperationPtr identity_n_op(ctx->CreateOperation()); + ForwardOperation forward_op; + forward_op.ctx = ctx; + TF_RETURN_IF_ERROR(Reset(identity_n_op.get(), "IdentityN", + /*raw_device_name=*/nullptr, &forward_op)); + if (isa(identity_n_op.get())) { + TF_RETURN_IF_ERROR(dyn_cast(identity_n_op.get()) + ->SetOpName("my_identity_n")); + } + TF_RETURN_IF_ERROR(AddInputList(identity_n_op.get(), inputs, &forward_op)); + int num_retvals = outputs.size(); + return Execute(identity_n_op.get(), ctx, outputs, &num_retvals, &forward_op, + tape, registry); +} + // Computes // y = inputs[0] + inputs[1] // return grad(y, {inputs[0], inputs[1]}) @@ -116,7 +138,8 @@ Status AddGradModel(AbstractContext* ctx, vspace, /*target_tensor_ids=*/{ToId(add_outputs[0])}, /*source_tensor_ids=*/{ToId(inputs[0]), ToId(inputs[1])}, source_tensors_that_are_targets, - /*output_gradients=*/{}, &out_grads)); + /*output_gradients=*/{}, &out_grads, + /*build_default_zeros_grads=*/false)); for (auto add_output : add_outputs) { add_output->Unref(); } @@ -146,7 +169,8 @@ Status ExpGradModel(AbstractContext* ctx, TF_RETURN_IF_ERROR(tape->ComputeGradient( vspace, /*target_tensor_ids=*/{ToId(exp_outputs[0])}, /*source_tensor_ids=*/{ToId(inputs[0])}, source_tensors_that_are_targets, - /*output_gradients=*/{}, &out_grads)); + /*output_gradients=*/{}, &out_grads, + /*build_default_zeros_grads=*/false)); for (auto exp_output : exp_outputs) { exp_output->Unref(); } @@ -155,6 +179,41 @@ Status ExpGradModel(AbstractContext* ctx, return Status::OK(); } +// Computes +// ignored, y = IdentityN(inputs[0], inputs[1]) +// return grad(y, {inputs[0], inputs[1]}) +// This should return [nullptr, 1]. +Status IdentityNGradModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + TapeVSpace vspace(ctx); + auto tape = new Tape(/*persistent=*/false); + tape->Watch(ToId(inputs[0])); + tape->Watch(ToId(inputs[1])); + + vector identity_n_outputs(2); + TF_RETURN_IF_ERROR(IdentityN(ctx, tape, inputs, + absl::MakeSpan(identity_n_outputs), registry)); + + std::unordered_map + source_tensors_that_are_targets; + vector out_grads; + TF_RETURN_IF_ERROR(tape->ComputeGradient( + vspace, /*target_tensor_ids=*/{ToId(identity_n_outputs[1])}, + /*source_tensor_ids=*/{ToId(inputs[0]), ToId(inputs[1])}, + source_tensors_that_are_targets, + /*output_gradients=*/{}, &out_grads, + /*build_default_zeros_grads=*/false)); + for (auto identity_n_output : identity_n_outputs) { + identity_n_output->Unref(); + } + outputs[0] = out_grads[0]; + outputs[1] = out_grads[1]; + delete tape; + return Status::OK(); +} + AbstractContext* BuildFunction(const char* fn_name) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); @@ -389,13 +448,72 @@ TEST_P(CppGradients, TestExpGrad) { result_tensor = nullptr; } -// TODO(b/160888630): Enable this test with mlir after AddInputList is -// supported. It is needed for AddN op which is used for gradient aggregation. +TEST_P(CppGradients, TestIdentityNGrad) { + // Pseudo-code: + // + // tape.watch(x1) + // tape.watch(x2) + // unused, y = IdentityN([x1, x2]) + // outputs = tape.gradient(y, [x1, x2]) + // Expected: [nullptr, 1] + // + // This test is interesting because the current implementation of GradientTape + // would return [0, 1] whereas we use build_default_zeros_grads=false here + // so we get back [nullptr, 1]. 
+ std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + AbstractTensorHandlePtr x1; + { + AbstractTensorHandle* x_raw = nullptr; + Status s = TestScalarTensorHandle(ctx.get(), 1.0f, &x_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + x1.reset(x_raw); + } + AbstractTensorHandlePtr x2; + { + AbstractTensorHandle* x_raw = nullptr; + Status s = TestScalarTensorHandle(ctx.get(), 1.0f, &x_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + x2.reset(x_raw); + } + + GradientRegistry registry; + Status s = RegisterGradients(®istry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + std::vector outputs(2); + s = RunModel(IdentityNGradModel, ctx.get(), {x1.get(), x2.get()}, + absl::MakeSpan(outputs), + /*use_function=*/!std::get<2>(GetParam()), registry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + EXPECT_EQ(outputs[0], nullptr); + TF_Tensor* result_tensor; + s = getValue(outputs[1], &result_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + auto result_value = static_cast(TF_TensorData(result_tensor)); + EXPECT_EQ(*result_value, 1.0); + outputs[1]->Unref(); + TF_DeleteTensor(result_tensor); + result_tensor = nullptr; +} + +// TODO(b/164171226): Enable this test with tfrt after AddInputList is +// supported. It is needed for IdentityN. #ifdef PLATFORM_GOOGLE INSTANTIATE_TEST_SUITE_P( UnifiedCAPI, CppGradients, ::testing::Combine(::testing::Values("graphdef", "mlir"), - /*tfrt*/ ::testing::Values(true, false), + /*tfrt*/ ::testing::Values(false), /*executing_eagerly*/ ::testing::Values(true, false))); #else INSTANTIATE_TEST_SUITE_P( diff --git a/tensorflow/c/eager/immediate_execution_context.h b/tensorflow/c/eager/immediate_execution_context.h index 6d06d9a8de6..02a3320ef65 100644 --- a/tensorflow/c/eager/immediate_execution_context.h +++ b/tensorflow/c/eager/immediate_execution_context.h @@ -57,15 +57,10 @@ class ImmediateExecutionContext : public AbstractContext { // Create a tensor instance from the given data buffer and description. // `memory_releaser` will be called on destruction, and it's responsible for - // cleaning up the underlying buffer. `convert_string` indicates whether it - // has to handle tstring conversion. Expected to be removed once tstring - // migration is done. - virtual AbstractTensorInterface* CreateTensor(DataType dtype, - const int64_t* dims, - int num_dims, void* data, - size_t len, bool convert_string, - MemoryReleaser memory_releaser, - void* memory_releaser_arg) = 0; + // cleaning up the underlying buffer. + virtual AbstractTensorInterface* CreateTensor( + DataType dtype, const int64_t* dims, int num_dims, void* data, size_t len, + MemoryReleaser memory_releaser, void* memory_releaser_arg) = 0; // Create a handle to wrap and manage a Tensor virtual ImmediateExecutionTensorHandle* CreateLocalHandle( diff --git a/tensorflow/c/eager/mnist_gradients_test.cc b/tensorflow/c/eager/mnist_gradients_test.cc new file mode 100644 index 00000000000..1f8ad138858 --- /dev/null +++ b/tensorflow/c/eager/mnist_gradients_test.cc @@ -0,0 +1,781 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include + +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_test_util.h" +#include "tensorflow/c/eager/c_api_unified_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" +#include "tensorflow/c/eager/gradients.h" +#include "tensorflow/c/eager/gradients_internal.h" +#include "tensorflow/c/eager/mnist_gradients_testutil.h" +#include "tensorflow/c/experimental/gradients/math_grad.h" +#include "tensorflow/c/experimental/gradients/nn_grad.h" +#include "tensorflow/c/experimental/ops/array_ops.h" +#include "tensorflow/c/tf_status_helper.h" +#include "tensorflow/c/tf_tensor.h" +#include "tensorflow/core/lib/llvm_rtti/llvm_rtti.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace gradients { +namespace internal { +namespace { + +class CppGradients + : public ::testing::TestWithParam> { + protected: + void SetUp() override { + TF_SetTracingImplementation(std::get<0>(GetParam())); + } +}; + +Status RegisterGradients(GradientRegistry* registry) { + TF_RETURN_IF_ERROR(registry->Register("Add", AddRegisterer)); + TF_RETURN_IF_ERROR(registry->Register("Exp", ExpRegisterer)); + TF_RETURN_IF_ERROR(registry->Register("MatMul", MatMulRegisterer)); + TF_RETURN_IF_ERROR(registry->Register("Relu", ReluRegisterer)); + TF_RETURN_IF_ERROR( + registry->Register("SparseSoftmaxCrossEntropyWithLogits", + SparseSoftmaxCrossEntropyLossRegisterer)); + return Status::OK(); +} + +// ========================= Test Util Functions ============================== + +// Get a scalar TensorHandle with given value +Status TestScalarTensorHandle(AbstractContext* ctx, float value, + AbstractTensorHandle** tensor) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TFE_Context* eager_ctx = + TF_ExecutionContextGetTFEContext(wrap(ctx), status.get()); + TF_RETURN_IF_ERROR(StatusFromTF_Status(status.get())); + TFE_TensorHandle* input_eager = TestScalarTensorHandle(eager_ctx, value); + *tensor = + unwrap(TF_CreateAbstractTensorFromEagerTensor(input_eager, status.get())); + return Status::OK(); +} + +// Get a Matrix TensorHandle with given float values and dimensions +Status TestTensorHandleWithDimsFloat(AbstractContext* ctx, float data[], + int64_t dims[], int num_dims, + AbstractTensorHandle** tensor) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TFE_Context* eager_ctx = + TF_ExecutionContextGetTFEContext(wrap(ctx), status.get()); + TF_RETURN_IF_ERROR(StatusFromTF_Status(status.get())); + TFE_TensorHandle* input_eager = + TestTensorHandleWithDimsFloat(eager_ctx, data, dims, num_dims); + *tensor = + unwrap(TF_CreateAbstractTensorFromEagerTensor(input_eager, status.get())); + return Status::OK(); +} + +// Get a Matrix TensorHandle with given int values and dimensions +Status TestTensorHandleWithDimsInt(AbstractContext* ctx, int data[], + int64_t dims[], int num_dims, + AbstractTensorHandle** tensor) { + 
std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TFE_Context* eager_ctx = + TF_ExecutionContextGetTFEContext(wrap(ctx), status.get()); + TF_RETURN_IF_ERROR(StatusFromTF_Status(status.get())); + TFE_TensorHandle* input_eager = + TestTensorHandleWithDimsInt(eager_ctx, data, dims, num_dims); + *tensor = + unwrap(TF_CreateAbstractTensorFromEagerTensor(input_eager, status.get())); + return Status::OK(); +} + +Status GetValue(AbstractTensorHandle* t, TF_Tensor** result_tensor) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TFE_TensorHandle* result_t = + TF_AbstractTensorGetEagerTensor(wrap(t), status.get()); + TF_RETURN_IF_ERROR(StatusFromTF_Status(status.get())); + *result_tensor = TFE_TensorHandleResolve(result_t, status.get()); + return Status::OK(); +} + +AbstractTensorHandlePtr GetTensorHandleUtilFloat(AbstractContext* ctx, + float vals[], int64_t dims[], + int num_dims) { + AbstractTensorHandlePtr A; + AbstractTensorHandle* a_raw = nullptr; + Status s = TestTensorHandleWithDimsFloat(ctx, vals, dims, num_dims, &a_raw); + A.reset(a_raw); + return A; +} + +AbstractTensorHandlePtr GetTensorHandleUtilInt(AbstractContext* ctx, int vals[], + int64_t dims[], int num_dims) { + AbstractTensorHandlePtr A; + AbstractTensorHandle* a_raw = nullptr; + Status s = TestTensorHandleWithDimsInt(ctx, vals, dims, num_dims, &a_raw); + A.reset(a_raw); + return A; +} + +// =========================== Start Tests ================================ + +TEST_P(CppGradients, TestMatMulGrad) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + float A_vals[] = {1.0f, 2.0f, 3.0f, 4.0f}; + int64_t A_dims[] = {2, 2}; + float B_vals[] = {.5f, -1.0f, 1.0f, 1.0f}; + int64_t B_dims[] = {2, 2}; + int num_dims = 2; + + AbstractTensorHandlePtr A = + GetTensorHandleUtilFloat(ctx.get(), A_vals, A_dims, num_dims); + AbstractTensorHandlePtr B = + GetTensorHandleUtilFloat(ctx.get(), B_vals, B_dims, num_dims); + + GradientRegistry registry; + Status s = RegisterGradients(®istry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + /* Pseudo-code: + * + * tape.watch(A) + * tape.watch(B) + * Y = AB + * outputs = tape.gradient(Y, [A, B]) + */ + + std::vector outputs(2); + s = RunModel(MatMulGradModel, ctx.get(), {A.get(), B.get()}, + absl::MakeSpan(outputs), + /*use_function=*/!std::get<2>(GetParam()), registry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + TF_Tensor* dA_tensor; + s = GetValue(outputs[0], &dA_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + float result_data[4] = {0}; + memcpy(&result_data[0], TF_TensorData(dA_tensor), + TF_TensorByteSize(dA_tensor)); + + float expected_dA[4] = {-.5f, 2.0f, -.5f, 2.0f}; + float tolerance = 1e-3; + for (int j = 0; j < 4; j++) { + ASSERT_NEAR(result_data[j], expected_dA[j], tolerance); + } + + TF_Tensor* dB_tensor; + s = GetValue(outputs[1], &dB_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + memcpy(&result_data[0], TF_TensorData(dB_tensor), + TF_TensorByteSize(dB_tensor)); + + float expected_dB[4] = {4.0f, 4.0f, 6.0f, 6.0f}; + for (int j = 0; j < 4; j++) { + ASSERT_NEAR(result_data[j], expected_dB[j], tolerance); + } + + outputs[0]->Unref(); + outputs[1]->Unref(); + TF_DeleteTensor(dA_tensor); + TF_DeleteTensor(dB_tensor); +} + +TEST_P(CppGradients, 
TestMNISTForward) { + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + // X = data + float X_vals[] = {1.0f, 2.0f, 3.0f, 4.0f}; + int64_t dims[] = {2, 2}; + int num_dims = 2; + AbstractTensorHandlePtr X = + GetTensorHandleUtilFloat(ctx.get(), X_vals, dims, num_dims); + + // W1 = first weights + float W1_vals[] = {-1.0f, 10.0f, .5f, 1.0f}; + AbstractTensorHandlePtr W1 = + GetTensorHandleUtilFloat(ctx.get(), W1_vals, dims, num_dims); + + // W2 = second weights + float W2_vals[] = {.1f, .2f, .3f, -.5f}; + AbstractTensorHandlePtr W2 = + GetTensorHandleUtilFloat(ctx.get(), W2_vals, dims, num_dims); + + // y = labels + int y_vals[] = {1, 1}; + int64_t dims_y[] = {2}; + num_dims = sizeof(dims_y) / sizeof(dims_y[0]); + AbstractTensorHandlePtr y = + GetTensorHandleUtilInt(ctx.get(), y_vals, dims, num_dims); + + GradientRegistry registry; + + // Run the Forward Pass + std::vector outputs(2); + Status s = + RunModel(MNISTForwardModel, ctx.get(), + {X.get(), W1.get(), W2.get(), y.get()}, absl::MakeSpan(outputs), + /*use_function=*/!std::get<2>(GetParam()), registry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + // Verify the Results + TF_Tensor* scores_tensor; + s = GetValue(outputs[0], &scores_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + float result_data[4] = {0}; + memcpy(&result_data[0], TF_TensorData(scores_tensor), + TF_TensorByteSize(scores_tensor)); + + float expected_scores[4] = {3.6f, -6.0f, 10.2f, -17.0f}; + float tolerance = 1e-3; + for (int j = 0; j < 4; j++) { + ASSERT_NEAR(result_data[j], expected_scores[j], tolerance); + } + + TF_Tensor* loss_vals_tensor; + s = GetValue(outputs[1], &loss_vals_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + memcpy(&result_data[0], TF_TensorData(loss_vals_tensor), + TF_TensorByteSize(loss_vals_tensor)); + float expected_losses[2] = {9.6f, 27.2f}; + for (int j = 0; j < 2; j++) { + ASSERT_NEAR(result_data[j], expected_losses[j], tolerance); + } + + outputs[0]->Unref(); + outputs[1]->Unref(); + TF_DeleteTensor(scores_tensor); + TF_DeleteTensor(loss_vals_tensor); +} + +TEST_P(CppGradients, TestMNISTForward2) { + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + // X = data + float X_vals[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + int64_t X_dims[] = {3, 2}; + int num_dims = 2; + AbstractTensorHandlePtr X = + GetTensorHandleUtilFloat(ctx.get(), X_vals, X_dims, num_dims); + + // W1 = first weights + float W1_vals[] = {-1.0f, 10.0f, .5f, 1.0f}; + int64_t dims[] = {2, 2}; + AbstractTensorHandlePtr W1 = + GetTensorHandleUtilFloat(ctx.get(), W1_vals, dims, num_dims); + + // W2 = second weights + float W2_vals[] = {.1f, .2f, .3f, -.5f}; + AbstractTensorHandlePtr W2 = + GetTensorHandleUtilFloat(ctx.get(), W2_vals, dims, num_dims); + + // y = labels + int y_vals[] = {1, 1, 1}; + int64_t y_dims[] = {3}; + num_dims = sizeof(y_dims) / sizeof(y_dims[0]); + AbstractTensorHandlePtr y = + GetTensorHandleUtilInt(ctx.get(), y_vals, y_dims, num_dims); + + GradientRegistry registry; + + // Run the Forward Pass + std::vector outputs(2); + Status s = + RunModel(MNISTForwardModel, ctx.get(), + {X.get(), W1.get(), W2.get(), y.get()}, absl::MakeSpan(outputs), + 
/*use_function=*/!std::get<2>(GetParam()), registry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + // Verify the Results + TF_Tensor* scores_tensor; + s = GetValue(outputs[0], &scores_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + float result_data[6] = {0}; + memcpy(&result_data[0], TF_TensorData(scores_tensor), + TF_TensorByteSize(scores_tensor)); + + float expected_scores[6] = {3.6f, -6.0f, 10.2f, -17.0f, 16.8f, -28.0f}; + float tolerance = 1e-3; + for (int j = 0; j < 6; j++) { + ASSERT_NEAR(result_data[j], expected_scores[j], tolerance); + } + + TF_Tensor* loss_vals_tensor; + s = GetValue(outputs[1], &loss_vals_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + memcpy(&result_data[0], TF_TensorData(loss_vals_tensor), + TF_TensorByteSize(loss_vals_tensor)); + float expected_losses[3] = {9.6f, 27.2f, 44.8f}; + for (int j = 0; j < 3; j++) { + ASSERT_NEAR(result_data[j], expected_losses[j], tolerance); + } + + outputs[0]->Unref(); + outputs[1]->Unref(); + TF_DeleteTensor(scores_tensor); + TF_DeleteTensor(loss_vals_tensor); +} + +TEST_P(CppGradients, TestMatMulTranspose) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + // X = data + float X_vals[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + int64_t X_dims[] = {2, 3}; + int num_dims = 2; + AbstractTensorHandlePtr X = + GetTensorHandleUtilFloat(ctx.get(), X_vals, X_dims, num_dims); + + // W1 = first weights + float W1_vals[] = {1.0f, 2.0f, 3.0f, 4.0f}; + int64_t dims[] = {2, 2}; + AbstractTensorHandlePtr W1 = + GetTensorHandleUtilFloat(ctx.get(), W1_vals, dims, num_dims); + + GradientRegistry registry; + + // Run the MatMul Op + std::vector outputs(1); + + Status s = RunModel(MatMulTransposeModel, ctx.get(), {X.get(), W1.get()}, + absl::MakeSpan(outputs), + /*use_function=*/!std::get<2>(GetParam()), registry); + + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + // Verify the Results + TF_Tensor* scores_tensor; + s = GetValue(outputs[0], &scores_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + float result_data[6] = {0}; + memcpy(&result_data[0], TF_TensorData(scores_tensor), + TF_TensorByteSize(scores_tensor)); + + float expected_scores[6] = {13.0f, 18.0f, 17.0f, 24.0f, 21.0f, 30.0f}; + float tolerance = 1e-3; + for (int j = 0; j < 6; j++) { + ASSERT_NEAR(result_data[j], expected_scores[j], tolerance); + } +} + +TEST_P(CppGradients, TestReluGrad) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + // X = data + float X_vals[] = {1.0f, 2.0f, 3.0f, -5.0f, -4.0f, -3.0f, 2.0f, 0.0f, -1.0f}; + int64_t X_dims[] = {3, 3}; + int num_dims = 2; + AbstractTensorHandlePtr X = + GetTensorHandleUtilFloat(ctx.get(), X_vals, X_dims, num_dims); + + GradientRegistry registry; + Status s = RegisterGradients(®istry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + /* Pseudo-code: + * + * tape.watch(X) + * Y = Relu(X) + * outputs = tape.gradient(Y, [X]) + */ + std::vector outputs(1); + s = RunModel(ReluGradModel, ctx.get(), {X.get()}, absl::MakeSpan(outputs), + 
/*use_function=*/!std::get<2>(GetParam()), registry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + TF_Tensor* dX_tensor; + s = GetValue(outputs[0], &dX_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + float result_data[9] = {0}; + memcpy(&result_data[0], TF_TensorData(dX_tensor), + TF_TensorByteSize(dX_tensor)); + + float expected_dX[9] = {1.0f, 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f}; + float tolerance = 1e-3; + for (int j = 0; j < 9; j++) { + ASSERT_NEAR(result_data[j], expected_dX[j], tolerance); + } + + outputs[0]->Unref(); + TF_DeleteTensor(dX_tensor); +} + +TEST_P(CppGradients, TestSoftmaxLossGrad) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + // X = scores + float X_vals[] = {1.0f, 2.0f, 3.0f, -5.0f, -4.0f, -3.0f, 2.0f, 0.0f, -1.0f}; + int64_t X_dims[] = {3, 3}; + int num_dims = 2; + AbstractTensorHandlePtr X = + GetTensorHandleUtilFloat(ctx.get(), X_vals, X_dims, num_dims); + + // y = labels + int y_vals[] = {1, 0, 1}; + int64_t y_dims[] = {3}; + num_dims = sizeof(y_dims) / sizeof(y_dims[0]); + AbstractTensorHandlePtr y = + GetTensorHandleUtilInt(ctx.get(), y_vals, y_dims, num_dims); + + GradientRegistry registry; + Status s = RegisterGradients(®istry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + /* Pseudo-code: + * + * tape.watch(X) + * tape.watch(labels) + * loss = SoftmaxLoss(X, labels) + * outputs = tape.gradient(loss, [X, labels]) + * + * + */ + + std::vector outputs(2); + s = RunModel(SoftmaxLossGradModel, ctx.get(), {X.get(), y.get()}, + absl::MakeSpan(outputs), + /*use_function=*/!std::get<2>(GetParam()), registry); + + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + TF_Tensor* dX_tensor; + s = GetValue(outputs[0], &dX_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + float result_data[9] = {0}; + memcpy(&result_data[0], TF_TensorData(dX_tensor), + TF_TensorByteSize(dX_tensor)); + + float expected_dX[9] = {0.090f, -0.7553f, 0.6652f, -0.9099f, 0.2447f, + 0.6652f, 0.8437f, -0.8858f, 0.0420f}; + float tolerance = 1e-3; + for (int j = 0; j < 9; j++) { + ASSERT_NEAR(result_data[j], expected_dX[j], tolerance); + } + + // Only Unref() first output as 2nd is nullptr grad for labels + outputs[0]->Unref(); + TF_DeleteTensor(dX_tensor); +} + +TEST_P(CppGradients, TestMNISTGrad) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + // X = data + float X_vals[] = {1.0f, 2.0f, 3.0f, 4.0f}; + int64_t X_dims[] = {2, 2}; + int num_dims = 2; + AbstractTensorHandlePtr X = + GetTensorHandleUtilFloat(ctx.get(), X_vals, X_dims, num_dims); + + // W1 = first weights + float W1_vals[] = {-1.0f, 10.0f, .5f, 1.0f}; + int64_t dims[] = {2, 2}; + AbstractTensorHandlePtr W1 = + GetTensorHandleUtilFloat(ctx.get(), W1_vals, dims, num_dims); + + // W2 = second weights + float W2_vals[] = {.1f, .2f, .3f, -.5f}; + AbstractTensorHandlePtr W2 = + GetTensorHandleUtilFloat(ctx.get(), W2_vals, dims, num_dims); + + // y = labels + int y_vals[] = {1, 1}; + int64_t y_dims[] = {2}; + num_dims = sizeof(y_dims) / sizeof(y_dims[0]); + 
AbstractTensorHandlePtr y = + GetTensorHandleUtilInt(ctx.get(), y_vals, y_dims, num_dims); + + // Register Grads + GradientRegistry registry; + Status s = RegisterGradients(®istry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + /* Pseudo-code: + * + * + * tape.watch(W1) + * tape.watch(W2) + * mm = X*W1 + * hidden = Relu(mm) + * scores = W2*hidden + * loss = SoftmaxLoss(scores, y) + * outputs = tape.gradient(loss, [A, B]) + * + */ + + std::vector outputs(3); + s = RunModel(MNISTGradModel, ctx.get(), + {X.get(), W1.get(), W2.get(), y.get()}, absl::MakeSpan(outputs), + /*use_function=*/!std::get<2>(GetParam()), registry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + float tolerance = 1e-3; + TF_Tensor* dW1_tensor; + s = GetValue(outputs[0], &dW1_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + float result_data[4] = {0}; + memcpy(&result_data[0], TF_TensorData(dW1_tensor), + TF_TensorByteSize(dW1_tensor)); + + float expected_dW1[4] = {0.0f, 3.2f, 0.0f, 4.8f}; + ; // dLoss + for (int j = 0; j < 4; j++) { + ASSERT_NEAR(result_data[j], expected_dW1[j], tolerance); + } + + TF_Tensor* dW2_tensor; + s = GetValue(outputs[1], &dW2_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + memcpy(&result_data[0], TF_TensorData(dW2_tensor), + TF_TensorByteSize(dW2_tensor)); + + float expected_dW2[4] = {0.0f, 0.0f, 46.0f, -46.0f}; // dLoss + for (int j = 0; j < 4; j++) { + ASSERT_NEAR(result_data[j], expected_dW2[j], tolerance); + } + + outputs[0]->Unref(); + outputs[1]->Unref(); + outputs[2]->Unref(); + TF_DeleteTensor(dW1_tensor); + TF_DeleteTensor(dW2_tensor); +} + +TEST_P(CppGradients, TestScalarMul) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + AbstractTensorHandlePtr eta; + { + AbstractTensorHandle* x_raw = nullptr; + Status s = TestScalarTensorHandle(ctx.get(), 1.5f, &x_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + eta.reset(x_raw); + } + + float A_vals[] = {1.0f, 2.0f, 3.0f, 4.0f}; + int64_t A_dims[] = {2, 2}; + int num_dims = 2; + + AbstractTensorHandlePtr A = + GetTensorHandleUtilFloat(ctx.get(), A_vals, A_dims, num_dims); + + GradientRegistry registry; + std::vector outputs(1); + Status s = RunModel(ScalarMulModel, ctx.get(), {eta.get(), A.get()}, + absl::MakeSpan(outputs), + /*use_function=*/!std::get<2>(GetParam()), registry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + TF_Tensor* dA_tensor; + s = GetValue(outputs[0], &dA_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + float result_data[4] = {0}; + memcpy(&result_data[0], TF_TensorData(dA_tensor), + TF_TensorByteSize(dA_tensor)); + + float tolerance = 1e-3; + float eta_val = 1.5f; + for (int j = 0; j < 4; j++) { + ASSERT_NEAR(result_data[j], eta_val * A_vals[j], tolerance); + } + + outputs[0]->Unref(); + TF_DeleteTensor(dA_tensor); +} + +TEST_P(CppGradients, TestMNIST_Training) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + // X = data + float X_vals[] = {1.0f, 2.0f, 3.0f, 4.0f}; + int64_t X_dims[] = {2, 2}; + int num_dims = 2; 
+ AbstractTensorHandlePtr X = + GetTensorHandleUtilFloat(ctx.get(), X_vals, X_dims, num_dims); + + // TODO(amturati): use random initializer for weights instead of + // constant values. + + // W1 = first weights + float W1_vals[] = {-.01f, 0.4f, 0.5f, -.2f}; + int64_t dims[] = {2, 2}; + AbstractTensorHandlePtr W1 = + GetTensorHandleUtilFloat(ctx.get(), W1_vals, dims, num_dims); + + // W2 = second weights + float W2_vals[] = {.1f, .2f, .3f, -.5f}; + AbstractTensorHandlePtr W2 = + GetTensorHandleUtilFloat(ctx.get(), W2_vals, dims, num_dims); + + // y = labels + int y_vals[] = {1, 1}; + int64_t y_dims[] = {2}; + num_dims = sizeof(y_dims) / sizeof(y_dims[0]); + AbstractTensorHandlePtr y = + GetTensorHandleUtilInt(ctx.get(), y_vals, y_dims, num_dims); + + // Register Grads + GradientRegistry registry; + Status s = RegisterGradients(®istry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + // Prepare for training + std::vector weights; + weights.push_back(W1.get()); + weights.push_back(W2.get()); + + // Set learning rate to be 1e-1 + AbstractTensorHandle* learning_rate = nullptr; + s = TestScalarTensorHandle(ctx.get(), 1e-1, &learning_rate); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + // Train + int num_iters = 10; + std::vector mnist_outputs(3); + std::vector grads(2); + for (int i = 0; i < num_iters; i++) { + // Run Forward Pass + s = RunModel(MNISTGradModel, ctx.get(), + {X.get(), weights[0], weights[1], y.get()}, + absl::MakeSpan(mnist_outputs), + /*use_function=*/!std::get<2>(GetParam()), registry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + // Fill grads + grads[0] = mnist_outputs[0]; + grads[1] = mnist_outputs[1]; + + // Gradient Update + s = UpdateWeights(ctx.get(), grads, weights, learning_rate); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + } + + grads[0]->Unref(); // release W1_grad + grads[1]->Unref(); // release W2_grad + mnist_outputs[2]->Unref(); // release loss +} + +#ifdef PLATFORM_GOOGLE +INSTANTIATE_TEST_SUITE_P( + UnifiedCAPI, CppGradients, + ::testing::Combine(::testing::Values("graphdef"), + /*tfrt*/ ::testing::Values(false), + /*executing_eagerly*/ ::testing::Values(true, false))); +#else +INSTANTIATE_TEST_SUITE_P( + UnifiedCAPI, CppGradients, + ::testing::Combine(::testing::Values("graphdef"), + /*tfrt*/ ::testing::Values(false), + /*executing_eagerly*/ ::testing::Values(true, false))); +#endif +} // namespace +} // namespace internal +} // namespace gradients +} // namespace tensorflow \ No newline at end of file diff --git a/tensorflow/c/eager/mnist_gradients_testutil.cc b/tensorflow/c/eager/mnist_gradients_testutil.cc new file mode 100644 index 00000000000..4b2c87c678d --- /dev/null +++ b/tensorflow/c/eager/mnist_gradients_testutil.cc @@ -0,0 +1,594 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/c/eager/mnist_gradients_testutil.h" + +#include + +#include "absl/container/flat_hash_set.h" +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" +#include "tensorflow/c/eager/gradients.h" +#include "tensorflow/c/eager/gradients_internal.h" +#include "tensorflow/c/experimental/ops/array_ops.h" +#include "tensorflow/c/experimental/ops/math_ops.h" +#include "tensorflow/c/experimental/ops/nn_ops.h" +#include "tensorflow/c/tf_status_helper.h" +#include "tensorflow/c/tf_tensor.h" +#include "tensorflow/core/lib/llvm_rtti/llvm_rtti.h" + +using std::vector; +using tracing::TracingOperation; + +// ========================== Tape Ops ============================== + +// Computes `inputs[0] + inputs[1]` and records it on the tape. +Status Add(AbstractContext* ctx, Tape* tape, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + AbstractOperationPtr add_op(ctx->CreateOperation()); + ForwardOperation forward_op; + forward_op.ctx = ctx; + TF_RETURN_IF_ERROR( + Reset(add_op.get(), "Add", /*raw_device_name=*/nullptr, &forward_op)); + if (isa(add_op.get())) { + TF_RETURN_IF_ERROR( + dyn_cast(add_op.get())->SetOpName("my_add")); + } + TF_RETURN_IF_ERROR(AddInput(add_op.get(), inputs[0], &forward_op)); + TF_RETURN_IF_ERROR(AddInput(add_op.get(), inputs[1], &forward_op)); + int num_retvals = 1; + return Execute(add_op.get(), ctx, outputs, &num_retvals, &forward_op, tape, + registry); +} + +// Computes `inputs[0] * inputs[1]` for matrices and records it on the tape. 
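+// `transpose_a`/`transpose_b` are attached to the op as attributes through
+// SetAttrBool on the ForwardOperation, so the recorded tape entry carries
+// them and the gradient function can read them back from the forward attrs.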
+Status MatMul(AbstractContext* ctx, Tape* tape, + absl::Span inputs, + absl::Span outputs, const char* name, + bool transpose_a, bool transpose_b, + const GradientRegistry& registry) { + AbstractOperationPtr matmul_op(ctx->CreateOperation()); + ForwardOperation forward_op; + forward_op.ctx = ctx; + TF_RETURN_IF_ERROR(Reset(matmul_op.get(), "MatMul", + /*raw_device_name=*/nullptr, &forward_op)); + if (isa(matmul_op.get())) { + TF_RETURN_IF_ERROR( + dyn_cast(matmul_op.get())->SetOpName(name)); + } + + TF_RETURN_IF_ERROR(AddInput(matmul_op.get(), inputs[0], &forward_op)); + TF_RETURN_IF_ERROR(AddInput(matmul_op.get(), inputs[1], &forward_op)); + TF_RETURN_IF_ERROR(tensorflow::gradients::internal::SetAttrBool( + matmul_op.get(), "transpose_a", transpose_a, &forward_op)); + TF_RETURN_IF_ERROR(tensorflow::gradients::internal::SetAttrBool( + matmul_op.get(), "transpose_b", transpose_b, &forward_op)); + + int num_retvals = 1; + return Execute(matmul_op.get(), ctx, outputs, &num_retvals, &forward_op, tape, + registry); +} + +Status Mul(AbstractContext* ctx, Tape* tape, + absl::Span inputs, + absl::Span outputs, const char* name, + const GradientRegistry& registry) { + AbstractOperationPtr mul_op(ctx->CreateOperation()); + ForwardOperation forward_op; + forward_op.ctx = ctx; + TF_RETURN_IF_ERROR( + Reset(mul_op.get(), "Mul", /*raw_device_name=*/nullptr, &forward_op)); + if (isa(mul_op.get())) { + TF_RETURN_IF_ERROR( + dyn_cast(mul_op.get())->SetOpName(name)); + } + + TF_RETURN_IF_ERROR(AddInput(mul_op.get(), inputs[0], &forward_op)); + TF_RETURN_IF_ERROR(AddInput(mul_op.get(), inputs[1], &forward_op)); + + int num_retvals = 1; + return Execute(mul_op.get(), ctx, outputs, &num_retvals, &forward_op, tape, + registry); +} + +// Computes `Relu(inputs[0])` and records it on the tape. +Status Relu(AbstractContext* ctx, Tape* tape, + absl::Span inputs, + absl::Span outputs, const char* name, + const GradientRegistry& registry) { + AbstractOperationPtr relu_op(ctx->CreateOperation()); + ForwardOperation forward_op; + forward_op.ctx = ctx; + TF_RETURN_IF_ERROR( + Reset(relu_op.get(), "Relu", /*raw_device_name=*/nullptr, &forward_op)); + if (isa(relu_op.get())) { + TF_RETURN_IF_ERROR( + dyn_cast(relu_op.get())->SetOpName(name)); + } + TF_RETURN_IF_ERROR(AddInput(relu_op.get(), inputs[0], &forward_op)); + int num_retvals = 1; + return Execute(relu_op.get(), ctx, outputs, &num_retvals, &forward_op, tape, + registry); +} + +// Computes `SoftmaxLoss(scores, labels)` for matrices and records it on the +// tape. 
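+// The wrapped op ("SparseSoftmaxCrossEntropyWithLogits") emits two tensors,
+// the per-example loss and the backprop gradient, so `outputs` must have
+// room for both (num_retvals is 2 below). A usage sketch mirroring the test
+// models in this file:
+//
+//   std::vector<AbstractTensorHandle*> sm_outputs(2);
+//   TF_RETURN_IF_ERROR(SparseSoftmaxCrossEntropyLoss(
+//       ctx, tape, {scores, y_labels}, absl::MakeSpan(sm_outputs),
+//       "softmax_loss", registry));
+//   // sm_outputs[0]: loss values, sm_outputs[1]: backprop.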
+Status SparseSoftmaxCrossEntropyLoss( + AbstractContext* ctx, Tape* tape, + absl::Span inputs, + absl::Span outputs, const char* name, + const GradientRegistry& registry) { + AbstractTensorHandle* scores = inputs[0]; + AbstractTensorHandle* labels = inputs[1]; + + AbstractOperationPtr sm_op(ctx->CreateOperation()); + ForwardOperation forward_op; + forward_op.ctx = ctx; + TF_RETURN_IF_ERROR(Reset(sm_op.get(), "SparseSoftmaxCrossEntropyWithLogits", + /*raw_device_name=*/nullptr, &forward_op)); + if (isa(sm_op.get())) { + TF_RETURN_IF_ERROR( + dyn_cast(sm_op.get())->SetOpName(name)); + } + + TF_RETURN_IF_ERROR(AddInput(sm_op.get(), scores, &forward_op)); + TF_RETURN_IF_ERROR(AddInput(sm_op.get(), labels, &forward_op)); + + int num_retvals = 2; // returns loss values and backprop + return Execute(sm_op.get(), ctx, outputs, &num_retvals, &forward_op, tape, + registry); +} + +//===================== Test Models to run ========================= + +// Computes +// y = inputs[0] + inputs[1] +// return grad(y, {inputs[0], inputs[1]}) +Status AddGradModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + TapeVSpace vspace(ctx); + auto tape = new Tape(/*persistent=*/false); + tape->Watch(ToId(inputs[0])); // Watch x. + tape->Watch(ToId(inputs[1])); // Watch y. + std::vector add_outputs(1); + TF_RETURN_IF_ERROR(Add(ctx, tape, inputs, absl::MakeSpan(add_outputs), + registry)); // Compute x+y. + std::unordered_map + source_tensors_that_are_targets; + + std::vector out_grads; + TF_RETURN_IF_ERROR(tape->ComputeGradient( + vspace, /*target_tensor_ids=*/{ToId(add_outputs[0])}, + /*source_tensor_ids=*/{ToId(inputs[0]), ToId(inputs[1])}, + source_tensors_that_are_targets, + /*output_gradients=*/{}, &out_grads, + /*build_default_zeros_grads=*/false)); + for (auto add_output : add_outputs) { + add_output->Unref(); + } + outputs[0] = out_grads[0]; + outputs[1] = out_grads[1]; + delete tape; + return Status::OK(); +} + +// Computes +// y = inputs[0] * inputs[1] +// return grad(y, {inputs[0], inputs[1]}) +Status MatMulGradModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + TapeVSpace vspace(ctx); + auto tape = new Tape(/*persistent=*/false); + tape->Watch(ToId(inputs[0])); // Watch x. + tape->Watch(ToId(inputs[1])); // Watch y. + vector mm_outputs(1); + TF_RETURN_IF_ERROR(MatMul(ctx, tape, inputs, absl::MakeSpan(mm_outputs), + "matmul0", /*transpose_a=*/false, + /*transpose_b=*/false, registry)); // Compute x*y. 
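+  // None of the watched inputs is itself a gradient target here, so the
+  // sources-that-are-targets map stays empty.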
+ + std::unordered_map + source_tensors_that_are_targets; + + vector out_grads; + TF_RETURN_IF_ERROR(tape->ComputeGradient( + vspace, /*target_tensor_ids=*/{ToId(mm_outputs[0])}, + /*source_tensor_ids=*/{ToId(inputs[0]), ToId(inputs[1])}, + source_tensors_that_are_targets, + /*output_gradients=*/{}, &out_grads, + /*build_default_zeros_grads=*/false)); + for (auto mm_output : mm_outputs) { + mm_output->Unref(); + } + outputs[0] = out_grads[0]; + outputs[1] = out_grads[1]; + delete tape; + return Status::OK(); +} + +// Model to run 2-layer net +Status MNISTForwardModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + /** + * We will trace a 2-layer fully connected network for an MNIST model: + * + * def mnist_forward(X, W1, W2, y_labels): + * mm_out_1 = tf.matmul(X,W1) + * hidden_layer = tf.nn.relu(mm_out_1) + * scores = tf.matmul(hidden_layer,W2) + * softmax = + * tf.nn.sparse_softmax_cross_entropy_with_logits(scores,y_labels) return + * scores, softmax + * + * Use this convention for inputs: + * + * inputs = [X, W1, W2, y_labels] + * + */ + AbstractTensorHandle* X = inputs[0]; + AbstractTensorHandle* W1 = inputs[1]; + AbstractTensorHandle* W2 = inputs[2]; + AbstractTensorHandle* y_labels = inputs[3]; + + TapeVSpace vspace(ctx); + auto tape = new Tape(/*persistent=*/false); + tape->Watch(ToId(W1)); // Watch W1. + tape->Watch(ToId(W2)); // Watch W2. + vector temp_outputs(1); + + TF_RETURN_IF_ERROR(MatMul(ctx, tape, {X, W1}, absl::MakeSpan(temp_outputs), + "matmul0", /*transpose_a=*/false, + /*transpose_b=*/false, registry)); // Compute X*W1 + + TF_RETURN_IF_ERROR(Relu(ctx, tape, {temp_outputs[0]}, + absl::MakeSpan(temp_outputs), "relu", + registry)); // Compute Relu(X*W1) + + TF_RETURN_IF_ERROR(MatMul(ctx, tape, {temp_outputs[0], W2}, + absl::MakeSpan(temp_outputs), "matmul1", + /*transpose_a=*/false, /*transpose_b=*/false, + registry)); // Compute W2*Relu(X*W1) + + AbstractTensorHandle* scores = temp_outputs[0]; + + TF_RETURN_IF_ERROR(SparseSoftmaxCrossEntropyLoss( + ctx, tape, {scores, y_labels}, absl::MakeSpan(temp_outputs), + "softmax_loss", registry)); // Compute Softmax(Scores,labels) + + AbstractTensorHandle* loss_vals = temp_outputs[0]; + + outputs[0] = scores; + outputs[1] = loss_vals; + delete tape; + return Status::OK(); +} + +Status MatMulTransposeModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + AbstractTensorHandle* X = inputs[0]; + AbstractTensorHandle* W1 = inputs[1]; + + TapeVSpace vspace(ctx); + auto tape = new Tape(/*persistent=*/false); + tape->Watch(ToId(X)); + tape->Watch(ToId(W1)); + vector temp_outputs(1); + + TF_RETURN_IF_ERROR(MatMul(ctx, tape, {X, W1}, absl::MakeSpan(temp_outputs), + "matmul0", /*transpose_a=*/true, + /*transpose_b=*/false, registry)); // Compute X*W1 + + outputs[0] = temp_outputs[0]; + + delete tape; + return Status::OK(); +} + +Status ReluGradModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + TapeVSpace vspace(ctx); + auto tape = new Tape(/*persistent=*/false); + tape->Watch(ToId(inputs[0])); // Watch X + vector relu_outputs(1); + TF_RETURN_IF_ERROR(Relu(ctx, tape, inputs, absl::MakeSpan(relu_outputs), + "relu0", registry)); // Relu(X) + + std::unordered_map + source_tensors_that_are_targets; + + vector out_grads; + TF_RETURN_IF_ERROR(tape->ComputeGradient( + vspace, /*target_tensor_ids=*/{ToId(relu_outputs[0])}, + /*source_tensor_ids=*/{ToId(inputs[0])}, 
source_tensors_that_are_targets, + /*output_gradients=*/{}, &out_grads, + /*build_default_zeros_grads=*/false)); + + for (auto relu_output : relu_outputs) { + relu_output->Unref(); + } + + outputs[0] = out_grads[0]; + delete tape; + return Status::OK(); +} + +Status SoftmaxLossGradModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + TapeVSpace vspace(ctx); + auto tape = new Tape(/*persistent=*/false); + tape->Watch(ToId(inputs[0])); // Watch scores. + tape->Watch(ToId(inputs[1])); // Watch labels. + vector sm_outputs(2); + TF_RETURN_IF_ERROR(SparseSoftmaxCrossEntropyLoss( + ctx, tape, inputs, absl::MakeSpan(sm_outputs), "softmax0", registry)); + + std::unordered_map + source_tensors_that_are_targets; + + vector out_grads; + TF_RETURN_IF_ERROR(tape->ComputeGradient( + vspace, /*target_tensor_ids=*/{ToId(sm_outputs[0])}, + /*source_tensor_ids=*/{ToId(inputs[0]), ToId(inputs[1])}, + source_tensors_that_are_targets, + /*output_gradients=*/{}, &out_grads, + /*build_default_zeros_grads=*/false)); + + outputs[0] = out_grads[0]; + outputs[1] = out_grads[1]; + delete tape; + return Status::OK(); +} + +Status MNISTGradModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + AbstractTensorHandle* X = inputs[0]; + AbstractTensorHandle* W1 = inputs[1]; + AbstractTensorHandle* W2 = inputs[2]; + AbstractTensorHandle* y_labels = inputs[3]; + + TapeVSpace vspace(ctx); + auto tape = new Tape(/*persistent=*/true); + tape->Watch(ToId(X)); // Watch X. + tape->Watch(ToId(W1)); // Watch W1. + tape->Watch(ToId(W2)); // Watch W1. + vector temp_outputs(1); + TF_RETURN_IF_ERROR(MatMul(ctx, tape, {X, W1}, absl::MakeSpan(temp_outputs), + "matmul0", /*transpose_a=*/false, + /*transpose_b=*/false, registry)); // Compute X*W1 + + AbstractTensorHandle* mm = temp_outputs[0]; + + TF_RETURN_IF_ERROR(Relu(ctx, tape, {mm}, + absl::MakeSpan(temp_outputs), // Relu(X*W1) + "relu0", registry)); + + AbstractTensorHandle* hidden = temp_outputs[0]; + + TF_RETURN_IF_ERROR(MatMul(ctx, tape, {hidden, W2}, + absl::MakeSpan(temp_outputs), "matmul1", + /*transpose_a=*/false, /*transpose_b=*/false, + registry)); // W2*Relu(X*W1) + + AbstractTensorHandle* scores = temp_outputs[0]; + + temp_outputs.resize(2); + TF_RETURN_IF_ERROR(SparseSoftmaxCrossEntropyLoss( + ctx, tape, {scores, y_labels}, absl::MakeSpan(temp_outputs), + "softmaxloss", registry)); // W2*Relu(X*W1) + + AbstractTensorHandle* loss = temp_outputs[0]; + + std::unordered_map + source_tensors_that_are_targets; + + vector out_grads; + TF_RETURN_IF_ERROR( + tape->ComputeGradient(vspace, /*target_tensor_ids=*/{ToId(loss)}, + /*source_tensor_ids=*/{ToId(W1), ToId(W2)}, + source_tensors_that_are_targets, + /*output_gradients=*/{}, &out_grads, + /*build_default_zeros_grads=*/false)); + + // Only release 2nd temp output as first holds loss values. 
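+  // (temp_outputs[0] aliases `loss`, which is handed back to the caller as
+  // outputs[2] below.)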
+ temp_outputs[1]->Unref(); + + outputs[0] = out_grads[0]; // dW1 + outputs[1] = out_grads[1]; // dW2 + outputs[2] = loss; + + delete tape; + return Status::OK(); +} + +Status ScalarMulModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + AbstractTensorHandle* eta = inputs[0]; + AbstractTensorHandle* A = inputs[1]; + + TapeVSpace vspace(ctx); + auto tape = new Tape(/*persistent=*/false); + vector temp_outputs(1); + + TF_RETURN_IF_ERROR(Mul(ctx, tape, {eta, A}, absl::MakeSpan(temp_outputs), + "scalarMul0", registry)); // Compute eta*A + + outputs[0] = temp_outputs[0]; + + delete tape; + return Status::OK(); +} + +// ============================= End Models ================================ + +Status UpdateWeights(AbstractContext* ctx, vector& grads, + vector& weights, + AbstractTensorHandle* learning_rate) { + /* Update weights one by one using gradient update rule: + * + * w -= lr*grad[w] + * + * NOTE: assuming learning rate is positive + */ + + Status s; + int num_grads = grads.size(); + vector temp_outputs(1); + std::string update_str; + + // Negate learning rate for gradient descent + TF_RETURN_IF_ERROR(ops::Neg(ctx, {learning_rate}, + absl::MakeSpan(temp_outputs), + "neg_lr")); // Compute -lr + learning_rate = temp_outputs[0]; + + for (int i = 0; i < num_grads; i++) { + // Compute dW = -lr * grad(w[i]) + update_str = "update_mul_" + std::to_string(i); + s = ops::Mul(ctx, {learning_rate, grads[i]}, absl::MakeSpan(temp_outputs), + update_str.c_str()); + + AbstractTensorHandle* dW = temp_outputs[0]; + + // Compute temp = weights[i] + dW + update_str = "update_add_" + std::to_string(i); + s = ops::Add(ctx, {weights[i], dW}, absl::MakeSpan(temp_outputs), + update_str.c_str()); + + // Update the weights + weights[i] = temp_outputs[0]; + } + + return Status::OK(); +} + +AbstractContext* BuildFunction(const char* fn_name) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TF_ExecutionContext* graph_ctx = TF_CreateFunction(fn_name, status.get()); + return unwrap(graph_ctx); +} + +Status CreateParamsForInputs(AbstractContext* ctx, + absl::Span inputs, + vector* params) { + tracing::TracingTensorHandle* handle = nullptr; + for (auto input : inputs) { + TF_RETURN_IF_ERROR(dyn_cast(ctx)->AddParameter( + input->DataType(), &handle)); + params->emplace_back(handle); + } + return Status::OK(); +} + +Status RunModel(Model model, AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, bool use_function, + const GradientRegistry& registry) { + if (use_function) { + const char* fn_name = "test_fn"; + std::unique_ptr scoped_func; + // Returning null tensors from a tf.function is not supported, so we keep + // track of indices in the model's outputs are nullptr in this set. + // The FunctionDef only outputs the non-null tensors. We later pad the + // function op outputs to have nullptrs at the `null_indices`. 
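+    // For example, if the model writes {t0, nullptr, t2}, the traced function
+    // returns only {t0, t2}, null_indices becomes {1}, and that slot of
+    // `outputs` is simply skipped when fn_outputs are copied back.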
+ absl::flat_hash_set null_indices; + { + AbstractContextPtr func_ctx(BuildFunction(fn_name)); + vector func_inputs; + func_inputs.reserve(inputs.size()); + TF_RETURN_IF_ERROR( + CreateParamsForInputs(func_ctx.get(), inputs, &func_inputs)); + vector model_outputs; + model_outputs.resize(outputs.size()); + TF_RETURN_IF_ERROR(model(func_ctx.get(), absl::MakeSpan(func_inputs), + absl::MakeSpan(model_outputs), registry)); + for (auto func_input : func_inputs) { + func_input->Unref(); + } + AbstractFunction* func = nullptr; + OutputList output_list; + output_list.expected_num_outputs = 0; + output_list.outputs.reserve(outputs.size()); + for (int i = 0; i < model_outputs.size(); i++) { + if (model_outputs[i]) { + output_list.outputs.emplace_back(model_outputs[i]); + output_list.expected_num_outputs += 1; + } else { + null_indices.insert(i); + } + } + TF_RETURN_IF_ERROR(dyn_cast(func_ctx.get()) + ->Finalize(&output_list, &func)); + scoped_func.reset(func); + for (auto output : output_list.outputs) { + output->Unref(); + } + TF_RETURN_IF_ERROR(ctx->RegisterFunction(func)); + } + + AbstractOperationPtr fn_op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(fn_op->Reset(fn_name, /*raw_device_name=*/nullptr)); + for (auto input : inputs) { + TF_RETURN_IF_ERROR(fn_op->AddInput(input)); + } + int retvals = outputs.size() - null_indices.size(); + vector fn_outputs(retvals); + TF_RETURN_IF_ERROR(fn_op->Execute( + absl::Span(fn_outputs.data(), fn_outputs.size()), + &retvals)); + int skipped_indices = 0; + for (int i = 0; i < outputs.size(); i++) { + if (!null_indices.contains(i)) { + outputs[i] = fn_outputs[i - skipped_indices]; + } else { + skipped_indices += 1; + } + } + TF_RETURN_IF_ERROR(ctx->RemoveFunction(fn_name)); + return Status::OK(); + } else { + return model(ctx, inputs, outputs, registry); + } +} + +Status BuildImmediateExecutionContext(bool use_tfrt, AbstractContext** ctx) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_ContextOptionsSetTfrt(opts, use_tfrt); + *ctx = unwrap(TF_NewEagerExecutionContext(opts, status.get())); + TF_RETURN_IF_ERROR(StatusFromTF_Status(status.get())); + TFE_DeleteContextOptions(opts); + return Status::OK(); +} diff --git a/tensorflow/c/eager/mnist_gradients_testutil.h b/tensorflow/c/eager/mnist_gradients_testutil.h new file mode 100644 index 00000000000..b6de8ff6788 --- /dev/null +++ b/tensorflow/c/eager/mnist_gradients_testutil.h @@ -0,0 +1,146 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include + +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" +#include "tensorflow/c/eager/gradients.h" +#include "tensorflow/c/eager/gradients_internal.h" +#include "tensorflow/c/experimental/ops/array_ops.h" +#include "tensorflow/c/experimental/ops/math_ops.h" +#include "tensorflow/c/experimental/ops/nn_ops.h" +#include "tensorflow/c/tf_status_helper.h" +#include "tensorflow/c/tf_tensor.h" +#include "tensorflow/core/lib/llvm_rtti/llvm_rtti.h" + +using namespace tensorflow; +using namespace tensorflow::gradients; +using namespace tensorflow::gradients::internal; + +// ========================== Tape Ops ============================== + +// Computes `inputs[0] + inputs[1]` and records it on the tape. +Status Add(AbstractContext* ctx, Tape* tape, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry); + +// Computes `inputs[0] * inputs[1]` for matrices and records it on the tape. +Status MatMul(AbstractContext* ctx, Tape* tape, + absl::Span inputs, + absl::Span outputs, const char* name, + bool transpose_a, bool transpose_b, + const GradientRegistry& registry); + +// Computes `inputs[0] * inputs[1]` and records it on the tape. +Status Mul(AbstractContext* ctx, Tape* tape, + absl::Span inputs, + absl::Span outputs, const char* name, + const GradientRegistry& registry); + +// Computes `Relu(inputs[0])` and records it on the tape. +Status Relu(AbstractContext* ctx, Tape* tape, + absl::Span inputs, + absl::Span outputs, const char* name, + const GradientRegistry& registry); + +// Computes `SoftmaxLoss(scores, labels)` for matrices and records it on the +// tape. +Status SparseSoftmaxCrossEntropyLoss( + AbstractContext* ctx, Tape* tape, + absl::Span inputs, + absl::Span outputs, const char* name, + const GradientRegistry& registry); + +// ====================== End Tape Ops ============================ + +// Computes +// y = inputs[0] + inputs[1] +// return grad(y, {inputs[0], inputs[1]}) +Status AddGradModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry); + +// Computes +// y = inputs[0] * inputs[1] +// return grad(y, {inputs[0], inputs[1]}) +Status MatMulGradModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry); + +// Computes 2-layer Neural Network with Softmax Loss. +Status MNISTForwardModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry); + +// Computes MatMul with first matrix tranposed. 
+Status MatMulTransposeModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry); + +// Test Model to verify ReluGrad functionality +Status ReluGradModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry); + +// Test Model to verify SoftmaxGrad functionality +Status SoftmaxLossGradModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry); + +// Test Model to verify Multi-grad functionality for MNIST +Status MNISTGradModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry); + +// Test Model to verify scalar-tensor multiplication Op +Status ScalarMulModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry); + +// Updates the weights for a neural network given incoming grads and learning +// rate +Status UpdateWeights(AbstractContext* ctx, + std::vector& grads, + std::vector& weights, + AbstractTensorHandle* learning_rate); + +AbstractContext* BuildFunction(const char* fn_name); + +Status CreateParamsForInputs(AbstractContext* ctx, + absl::Span inputs, + std::vector* params); + +using Model = std::function, + absl::Span, const GradientRegistry&)>; + +Status RunModel(Model model, AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, bool use_function, + const GradientRegistry& registry); + +Status BuildImmediateExecutionContext(bool use_tfrt, AbstractContext** ctx); diff --git a/tensorflow/c/eager/parallel_device/BUILD b/tensorflow/c/eager/parallel_device/BUILD index 0d0e5ffce10..df5504adce2 100644 --- a/tensorflow/c/eager/parallel_device/BUILD +++ b/tensorflow/c/eager/parallel_device/BUILD @@ -76,10 +76,26 @@ cc_library( "//tensorflow/c/eager:c_api_experimental", "//tensorflow/core:lib", "@com_google_absl//absl/types:optional", + "@com_google_absl//absl/types:span", "@com_google_absl//absl/types:variant", ], ) +tf_cc_test( + name = "parallel_device_lib_test", + srcs = ["parallel_device_lib_test.cc"], + deps = [ + ":parallel_device_lib", + "//tensorflow/c:c_api", + "//tensorflow/c:c_api_experimental", + "//tensorflow/c/eager:c_api", + "//tensorflow/c/eager:c_api_experimental", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + cc_library( name = "parallel_device_testlib", testonly = 1, diff --git a/tensorflow/c/eager/parallel_device/parallel_device_lib.cc b/tensorflow/c/eager/parallel_device/parallel_device_lib.cc index 768f686bd88..e270bfcbb80 100644 --- a/tensorflow/c/eager/parallel_device/parallel_device_lib.cc +++ b/tensorflow/c/eager/parallel_device/parallel_device_lib.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/c/eager/parallel_device/parallel_device_lib.h" +#include "tensorflow/c/tf_status.h" #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/mutex.h" @@ -118,6 +119,9 @@ class DeviceThread { int expected_max_outputs_ TF_GUARDED_BY(execution_mutex_); // Outputs std::vector op_outputs_ TF_GUARDED_BY(execution_mutex_); + // TF_Status is an incomplete type and so can't be stack allocated. To avoid + // unnecessary allocations each Execute call, we keep one heap-allocated + // version for the thread. 
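+  // Join() copies this status out for the caller and then resets it to OK,
+  // so a failed op does not poison later executions on the thread.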
StatusPtr status_ TF_GUARDED_BY(execution_mutex_); const std::string device_; @@ -188,6 +192,9 @@ std::vector DeviceThread::Join(TF_Status* status) { if (TF_GetCode(status_.get()) != TF_OK) { TF_SetStatus(status, TF_GetCode(status_.get()), TF_Message(status_.get())); + // Reset the member `status_` so future op executions (after recovery from + // the bad `status`) start with an OK status. + TF_SetStatus(status_.get(), TF_OK, ""); } execution_state_ = ExecutionState::kIdle; result = std::move(op_outputs_); @@ -255,18 +262,27 @@ std::unique_ptr ParallelDevice::CopyToParallelDevice( status); } -std::unique_ptr ParallelDevice::DeviceIDs( - TFE_Context* context, TF_Status* status) const { +std::unique_ptr ParallelDevice::Vector( + TFE_Context* context, TF_Status* status, + absl::Span values) const { // TODO(allenl): We could cache DeviceIDs (keyed by context). std::vector components; components.reserve(underlying_devices_.size()); - for (int device_index = 0; device_index < underlying_devices_.size(); + + if (values.size() != num_underlying_devices()) { + TF_SetStatus( + status, TF_INVALID_ARGUMENT, + "Number of values did not match number of underlying devices."); + return nullptr; + } + + for (int device_index = 0; device_index < num_underlying_devices(); ++device_index) { - int32_t* device_id = new int32_t; - *device_id = device_index; + int32_t* device_value = new int32_t; + *device_value = values[device_index]; std::unique_ptr tensor( TF_NewTensor( - TF_INT32, /*dims=*/nullptr, /*num_dims=*/0, device_id, + TF_INT32, /*dims=*/nullptr, /*num_dims=*/0, device_value, sizeof(int32_t), [](void* data, size_t, void* arg) { delete reinterpret_cast(data); @@ -295,6 +311,16 @@ std::unique_ptr ParallelDevice::DeviceIDs( status); } +std::unique_ptr ParallelDevice::DeviceIDs( + TFE_Context* context, TF_Status* status) const { + std::vector ids; + ids.reserve(num_underlying_devices()); + for (int i = 0; i < num_underlying_devices(); ++i) { + ids.push_back(i); + } + return Vector(context, status, ids); +} + absl::optional>> ParallelDevice::Execute(TFE_Context* context, const std::vector& inputs, @@ -319,21 +345,36 @@ ParallelDevice::Execute(TFE_Context* context, std::move(device_inputs), attributes, expected_max_outputs); } + StatusPtr first_bad_status(nullptr); for (int device_index = 0; device_index < underlying_devices_.size(); ++device_index) { DeviceThread* device_thread = device_threads_[device_index].get(); per_device_output_tensors.push_back(device_thread->Join(status)); - if (TF_GetCode(status) != TF_OK) return result; + // We will run every Join even if there are bad statuses in case the user + // wants to recover and continue running ops on the parallel device (which + // would otherwise deadlock). 
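+    // Only the first non-OK status is preserved for the caller; the remaining
+    // Joins still run so every DeviceThread returns to the idle state.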
+ if (TF_GetCode(status) != TF_OK && first_bad_status == nullptr) { + first_bad_status.reset(TF_NewStatus()); + TF_SetStatus(first_bad_status.get(), TF_GetCode(status), + TF_Message(status)); + } + if (device_index == 0) { first_op_output_count = per_device_output_tensors.rbegin()->size(); } else { - if (per_device_output_tensors.rbegin()->size() != first_op_output_count) { - TF_SetStatus(status, TF_INTERNAL, + if (first_bad_status == nullptr && + per_device_output_tensors.rbegin()->size() != first_op_output_count) { + first_bad_status.reset(TF_NewStatus()); + TF_SetStatus(first_bad_status.get(), TF_INTERNAL, "Parallel ops produced different numbers of tensors."); - return result; } } } + if (first_bad_status != nullptr) { + TF_SetStatus(status, TF_GetCode(first_bad_status.get()), + TF_Message(first_bad_status.get())); + return result; + } // For each output of the original operation, pack the per-device // TensorHandles we've computed into a single parallel TensorHandle. std::vector> per_device_outputs; diff --git a/tensorflow/c/eager/parallel_device/parallel_device_lib.h b/tensorflow/c/eager/parallel_device/parallel_device_lib.h index cbfea31d95f..b3dc47ab088 100644 --- a/tensorflow/c/eager/parallel_device/parallel_device_lib.h +++ b/tensorflow/c/eager/parallel_device/parallel_device_lib.h @@ -21,6 +21,7 @@ limitations under the License. #include #include "absl/types/optional.h" +#include "absl/types/span.h" #include "absl/types/variant.h" #include "tensorflow/c/c_api.h" #include "tensorflow/c/eager/c_api.h" @@ -61,6 +62,11 @@ class ParallelDevice { TFE_TensorHandle* tensor, TF_Status* status) const; + // Construct a parallel tensor consisting of the scalar values from `values`. + std::unique_ptr Vector( + TFE_Context* context, TF_Status* status, + absl::Span values) const; + // A parallel tensor with scalar integers numbering component devices. std::unique_ptr DeviceIDs(TFE_Context* context, TF_Status* status) const; diff --git a/tensorflow/c/eager/parallel_device/parallel_device_lib_test.cc b/tensorflow/c/eager/parallel_device/parallel_device_lib_test.cc new file mode 100644 index 00000000000..35befe959cb --- /dev/null +++ b/tensorflow/c/eager/parallel_device/parallel_device_lib_test.cc @@ -0,0 +1,84 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/c/eager/parallel_device/parallel_device_lib.h" + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_experimental.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace parallel_device { + +TEST(PARALLEL_DEVICE_LIB, TestOpWithError) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + std::unique_ptr opts( + TFE_NewContextOptions(), TFE_DeleteContextOptions); + std::unique_ptr config( + TF_CreateConfig( + /*xla*/ false, + /* gpu_memory_allow_growth */ true, /* num_cpu_devices */ + 2), + TF_DeleteBuffer); + TFE_ContextOptionsSetConfig(opts.get(), config->data, config->length, + status.get()); + std::unique_ptr context( + TFE_NewContext(opts.get(), status.get()), TFE_DeleteContext); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + + std::vector devices{ + "/job:localhost/replica:0/task:0/device:CPU:0", + "/job:localhost/replica:0/task:0/device:CPU:1"}; + ParallelDevice parallel_device(std::move(devices)); + std::unique_ptr handle_op( + TFE_NewOp(context.get(), "VarHandleOp", status.get()), TFE_DeleteOp); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + TFE_OpSetAttrType(handle_op.get(), "dtype", TF_FLOAT); + TFE_OpSetAttrShape(handle_op.get(), "shape", /*dims=*/nullptr, /*num_dims=*/0, + status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + auto outputs = + parallel_device.Execute(context.get(), std::vector(), + "VarHandleOp", TFE_OpGetAttrs(handle_op.get()), + /*expected_max_outputs=*/1, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + const std::vector>& handles = *outputs; + std::vector handle_inputs; + handle_inputs.reserve(handles.size()); + for (auto& handle : handles) { + handle_inputs.push_back(handle.get()); + } + std::unique_ptr read_op( + TFE_NewOp(context.get(), "ReadVariableOp", status.get()), TFE_DeleteOp); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + TFE_OpSetAttrType(read_op.get(), "dtype", TF_FLOAT); + parallel_device.Execute(context.get(), handle_inputs, "ReadVariableOp", + TFE_OpGetAttrs(read_op.get()), + /*expected_max_outputs=*/1, status.get()); + ASSERT_FALSE(TF_GetCode(status.get()) == TF_OK); + TF_SetStatus(status.get(), TF_OK, ""); + + // Check that ops still run successfully on the device. + parallel_device.Execute(context.get(), std::vector(), + "VarHandleOp", TFE_OpGetAttrs(handle_op.get()), + /*expected_max_outputs=*/1, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); +} + +} // namespace parallel_device +} // namespace tensorflow diff --git a/tensorflow/c/eager/tape.h b/tensorflow/c/eager/tape.h index 27629bb3bdf..fcebe973500 100644 --- a/tensorflow/c/eager/tape.h +++ b/tensorflow/c/eager/tape.h @@ -146,13 +146,16 @@ class GradientTape { // once) and produces the gradient of the target tensors with respect to the // source tensors. The output gradients are used if not empty and not // null. The result is populated with one tensor per target element. + // When running backward functions, builds zeros-like tensors for + // incoming grads which are nullptrs, unless `build_default_zeros_grads` + // is set to false. 
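+  // A call that opts out of the default zeros might look like this (sketch;
+  // `y_id`/`x_id` stand for tensor ids already watched on the tape):
+  //
+  //   std::vector<Gradient*> grads;
+  //   TF_RETURN_IF_ERROR(tape.ComputeGradient(
+  //       vspace, /*target_tensor_ids=*/{y_id}, /*source_tensor_ids=*/{x_id},
+  //       /*sources_that_are_targets=*/{}, /*output_gradients=*/{}, &grads,
+  //       /*build_default_zeros_grads=*/false));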
Status ComputeGradient( const VSpace& vspace, const gtl::ArraySlice target_tensor_ids, const gtl::ArraySlice source_tensor_ids, const std::unordered_map& sources_that_are_targets, gtl::ArraySlice output_gradients, - std::vector* result); + std::vector* result, bool build_default_zeros_grads = true); bool IsPersistent() const { return persistent_; } @@ -655,8 +658,8 @@ Status GradientTape::ComputeGradient( const gtl::ArraySlice target_tensor_ids, const gtl::ArraySlice source_tensor_ids, const std::unordered_map& sources_that_are_targets, - gtl::ArraySlice output_gradients, - std::vector* result) { + gtl::ArraySlice output_gradients, std::vector* result, + bool build_default_zeros_grads) { std::unordered_set sources_set(source_tensor_ids.begin(), source_tensor_ids.end()); BackpropInitialState state = PrepareBackprop( @@ -717,14 +720,14 @@ Status GradientTape::ComputeGradient( const int64 id = trace.output_tensor_info[i].GetID(); auto grad_it = gradients.find(id); if (grad_it == gradients.end()) { - auto func_name_it = - FunctionsAcceptingNoneForIndicesMap()->find(trace.op_type); - if (func_name_it != FunctionsAcceptingNoneForIndicesMap()->end() && - func_name_it->second.find(i) != func_name_it->second.end()) { - out_gradients.push_back(nullptr); - } else { - out_gradients.push_back(nullptr); - zero_indices.push_back(i); + out_gradients.push_back(nullptr); + if (build_default_zeros_grads) { + auto func_name_it = + FunctionsAcceptingNoneForIndicesMap()->find(trace.op_type); + if (func_name_it == FunctionsAcceptingNoneForIndicesMap()->end() || + func_name_it->second.find(i) == func_name_it->second.end()) { + zero_indices.push_back(i); + } } } else { any_gradient_nonzero = true; @@ -745,6 +748,7 @@ Status GradientTape::ComputeGradient( } } std::vector in_gradients; + DCHECK(build_default_zeros_grads || zero_indices.empty()); if (any_gradient_nonzero) { for (const auto i : zero_indices) { out_gradients[i] = trace.output_tensor_info[i].ZerosLike(); diff --git a/tensorflow/c/experimental/filesystem/plugins/s3/BUILD b/tensorflow/c/experimental/filesystem/plugins/s3/BUILD index 56bd3b4a75c..a2108d06cbb 100644 --- a/tensorflow/c/experimental/filesystem/plugins/s3/BUILD +++ b/tensorflow/c/experimental/filesystem/plugins/s3/BUILD @@ -26,6 +26,8 @@ cc_library( }), deps = [ ":aws_crypto", + ":aws_logging", + "//tensorflow/c:logging", "//tensorflow/c:tf_status", "//tensorflow/c/experimental/filesystem:filesystem_interface", "@aws", @@ -45,6 +47,18 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "aws_logging", + srcs = ["aws_logging.cc"], + hdrs = ["aws_logging.h"], + deps = [ + "//tensorflow/c:logging", + "@aws", + "@com_google_absl//absl/synchronization", + ], + alwayslink = 1, +) + tf_cc_test( name = "s3_filesystem_test", srcs = [ diff --git a/tensorflow/c/experimental/filesystem/plugins/s3/aws_logging.cc b/tensorflow/c/experimental/filesystem/plugins/s3/aws_logging.cc new file mode 100644 index 00000000000..353b733fd25 --- /dev/null +++ b/tensorflow/c/experimental/filesystem/plugins/s3/aws_logging.cc @@ -0,0 +1,159 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/c/experimental/filesystem/plugins/s3/aws_logging.h" + +#include +#include +#include + +#include +#include +#include + +#include "absl/synchronization/mutex.h" +#include "tensorflow/c/logging.h" + +static constexpr char kAWSLoggingTag[] = "AWSLogging"; + +static const std::map + log_levels_string_to_aws = { + {"off", Aws::Utils::Logging::LogLevel::Off}, + {"fatal", Aws::Utils::Logging::LogLevel::Fatal}, + {"error", Aws::Utils::Logging::LogLevel::Error}, + {"warn", Aws::Utils::Logging::LogLevel::Warn}, + {"info", Aws::Utils::Logging::LogLevel::Info}, + {"debug", Aws::Utils::Logging::LogLevel::Debug}, + {"trace", Aws::Utils::Logging::LogLevel::Trace}}; + +static const std::map + log_levels_tf_to_aws = {{0, Aws::Utils::Logging::LogLevel::Info}, + {1, Aws::Utils::Logging::LogLevel::Warn}, + {2, Aws::Utils::Logging::LogLevel::Error}, + {3, Aws::Utils::Logging::LogLevel::Fatal}}; + +namespace tf_s3_filesystem { + +AWSLogSystem::AWSLogSystem(Aws::Utils::Logging::LogLevel log_level) + : log_level_(log_level) {} + +void AWSLogSystem::LogMessage(Aws::Utils::Logging::LogLevel log_level, + const std::string& message) { + if (message == "Initializing Curl library") return; + switch (log_level) { + case Aws::Utils::Logging::LogLevel::Info: + TF_Log(TF_INFO, message.c_str()); + break; + case Aws::Utils::Logging::LogLevel::Warn: + TF_Log(TF_WARNING, message.c_str()); + break; + case Aws::Utils::Logging::LogLevel::Error: + TF_Log(TF_ERROR, message.c_str()); + break; + case Aws::Utils::Logging::LogLevel::Fatal: + TF_Log(TF_FATAL, message.c_str()); + break; + default: + // this will match for DEBUG, TRACE + TF_Log(TF_INFO, message.c_str()); + break; + } +} + +void AWSLogSystem::Log(Aws::Utils::Logging::LogLevel log_level, const char* tag, + const char* format, ...) { + char buffer[256]; + va_list args; + va_start(args, format); + vsnprintf(buffer, 256, format, args); + va_end(args); + LogMessage(log_level, buffer); +} + +void AWSLogSystem::LogStream(Aws::Utils::Logging::LogLevel log_level, + const char* tag, + const Aws::OStringStream& message_stream) { + LogMessage(log_level, message_stream.rdbuf()->str().c_str()); +} + +void AWSLogSystem::Flush() { return; } + +static Aws::Utils::Logging::LogLevel TfLogLevelToAwsLogLevel(int level) { + // Converts TF Log Levels INFO, WARNING, ERROR and FATAL to the AWS enum + // values for the levels + if (log_levels_tf_to_aws.find(level) != log_levels_tf_to_aws.end()) { + return log_levels_tf_to_aws.at(level); + } else { + // default to fatal + return Aws::Utils::Logging::LogLevel::Fatal; + } +} + +static Aws::Utils::Logging::LogLevel ParseAwsLogLevelFromEnv() { + // defaults to FATAL log level for the AWS SDK + // this is because many normal tensorflow operations are logged as errors in + // the AWS SDK such as checking if a file exists can log an error in AWS SDK + // if the file does not actually exist. Another such case is when reading a + // file till the end, TensorFlow expects to see an InvalidRange exception at + // the end, but this would be an error in the AWS SDK. 
This confuses users, + // hence the default setting. + Aws::Utils::Logging::LogLevel log_level = + Aws::Utils::Logging::LogLevel::Fatal; + + const char* aws_env_var_val = getenv("AWS_LOG_LEVEL"); + if (aws_env_var_val != nullptr) { + std::string maybe_integer_str(aws_env_var_val, strlen(aws_env_var_val)); + std::istringstream ss(maybe_integer_str); + int level; + ss >> level; + if (ss.fail()) { + // wasn't a number + // expecting a string + std::string level_str = maybe_integer_str; + if (log_levels_string_to_aws.find(level_str) != + log_levels_string_to_aws.end()) { + log_level = log_levels_string_to_aws.at(level_str); + } + } else { + // backwards compatibility + // valid number, but this number follows the standard TensorFlow log + // levels need to convert this to AWS SDK logging level number + log_level = TfLogLevelToAwsLogLevel(level); + } + } + return log_level; +} + +static bool initialized = false; +ABSL_CONST_INIT static absl::Mutex s3_logging_mutex(absl::kConstInit); +void AWSLogSystem::InitializeAWSLogging() { + absl::MutexLock l(&s3_logging_mutex); + if (!initialized) { + Aws::Utils::Logging::InitializeAWSLogging(Aws::MakeShared( + kAWSLoggingTag, ParseAwsLogLevelFromEnv())); + initialized = true; + return; + } +} + +void AWSLogSystem::ShutdownAWSLogging() { + absl::MutexLock l(&s3_logging_mutex); + if (initialized) { + Aws::Utils::Logging::ShutdownAWSLogging(); + initialized = false; + return; + } +} + +} // namespace tf_s3_filesystem diff --git a/tensorflow/c/experimental/filesystem/plugins/s3/aws_logging.h b/tensorflow/c/experimental/filesystem/plugins/s3/aws_logging.h new file mode 100644 index 00000000000..afecd7e5e62 --- /dev/null +++ b/tensorflow/c/experimental/filesystem/plugins/s3/aws_logging.h @@ -0,0 +1,64 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_S3_AWS_LOGGING_H_ +#define TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_S3_AWS_LOGGING_H_ + +#include +#include + +#include +#include + +namespace tf_s3_filesystem { + +class AWSLogSystem : public Aws::Utils::Logging::LogSystemInterface { + public: + static void InitializeAWSLogging(); + static void ShutdownAWSLogging(); + + explicit AWSLogSystem(Aws::Utils::Logging::LogLevel log_level); + virtual ~AWSLogSystem() = default; + + // Gets the currently configured log level. + Aws::Utils::Logging::LogLevel GetLogLevel(void) const override { + return log_level_; + } + + // Set a new log level. This has the immediate effect of changing the log. + void SetLogLevel(Aws::Utils::Logging::LogLevel log_level) { + log_level_.store(log_level); + } + + // Does a printf style output to ProcessFormattedStatement. Don't use this, + // it's unsafe. See LogStream. + void Log(Aws::Utils::Logging::LogLevel log_level, const char* tag, + const char* format, ...) override; + + // Writes the stream to ProcessFormattedStatement. 
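+  // Both Log() and LogStream() route through LogMessage(), which maps the
+  // AWS log level onto the matching TF_Log severity (Debug and Trace fall
+  // back to TF_INFO).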
+ void LogStream(Aws::Utils::Logging::LogLevel log_level, const char* tag, + const Aws::OStringStream& messageStream) override; + + // Flushes the buffered messages if the logger supports buffering + void Flush() override; + + private: + void LogMessage(Aws::Utils::Logging::LogLevel log_level, + const std::string& message); + std::atomic log_level_; +}; + +} // namespace tf_s3_filesystem + +#endif // TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_S3_AWS_LOGGING_H_ diff --git a/tensorflow/c/experimental/filesystem/plugins/s3/s3_filesystem.cc b/tensorflow/c/experimental/filesystem/plugins/s3/s3_filesystem.cc index 7e1b36f2dcc..9ff07633f2a 100644 --- a/tensorflow/c/experimental/filesystem/plugins/s3/s3_filesystem.cc +++ b/tensorflow/c/experimental/filesystem/plugins/s3/s3_filesystem.cc @@ -38,6 +38,8 @@ limitations under the License. #include "absl/strings/str_cat.h" #include "tensorflow/c/experimental/filesystem/filesystem_interface.h" #include "tensorflow/c/experimental/filesystem/plugins/s3/aws_crypto.h" +#include "tensorflow/c/experimental/filesystem/plugins/s3/aws_logging.h" +#include "tensorflow/c/logging.h" #include "tensorflow/c/tf_status.h" // Implementation of a filesystem for S3 environments. @@ -186,6 +188,8 @@ static void GetS3Client(tf_s3_filesystem::S3File* s3_file) { absl::MutexLock l(&s3_file->initialization_lock); if (s3_file->s3_client.get() == nullptr) { + tf_s3_filesystem::AWSLogSystem::InitializeAWSLogging(); + Aws::SDKOptions options; options.cryptoOptions.sha256Factory_create_fn = []() { return Aws::MakeShared( @@ -250,6 +254,7 @@ static void ShutdownClient(Aws::S3::S3Client* s3_client) { delete s3_client; Aws::SDKOptions options; Aws::ShutdownAPI(options); + tf_s3_filesystem::AWSLogSystem::ShutdownAWSLogging(); } } @@ -281,6 +286,7 @@ void Cleanup(TF_RandomAccessFile* file) { static int64_t ReadS3Client(S3File* s3_file, uint64_t offset, size_t n, char* buffer, TF_Status* status) { + TF_VLog(3, "ReadFile using S3Client\n"); Aws::S3::Model::GetObjectRequest get_object_request; get_object_request.WithBucket(s3_file->bucket).WithKey(s3_file->object); Aws::String bytes = @@ -306,12 +312,14 @@ static int64_t ReadS3Client(S3File* s3_file, uint64_t offset, size_t n, static int64_t ReadS3TransferManager(S3File* s3_file, uint64_t offset, size_t n, char* buffer, TF_Status* status) { + TF_VLog(3, "Using TransferManager\n"); auto create_download_stream = [&]() { return Aws::New( "S3ReadStream", Aws::New( "S3ReadStream", reinterpret_cast(buffer), n)); }; + TF_VLog(3, "Created stream to read with transferManager\n"); auto handle = s3_file->transfer_manager->DownloadFile( s3_file->bucket, s3_file->object, offset, n, create_download_stream); handle->WaitUntilFinished(); @@ -322,6 +330,10 @@ static int64_t ReadS3TransferManager(S3File* s3_file, uint64_t offset, size_t n, Aws::Http::HttpResponseCode::REQUESTED_RANGE_NOT_SATISFIABLE && retries++ < kDownloadRetries) { // Only failed parts will be downloaded again. + TF_VLog( + 1, + "Retrying read of s3://%s/%s after failure. 
Current retry count: %u\n", + s3_file->bucket.c_str(), s3_file->object.c_str(), retries); s3_file->transfer_manager->RetryDownload(handle); handle->WaitUntilFinished(); } @@ -341,6 +353,8 @@ static int64_t ReadS3TransferManager(S3File* s3_file, uint64_t offset, size_t n, int64_t Read(const TF_RandomAccessFile* file, uint64_t offset, size_t n, char* buffer, TF_Status* status) { auto s3_file = static_cast(file->plugin_file); + TF_VLog(1, "ReadFilefromS3 s3://%s/%s from %u for n: %u\n", + s3_file->bucket.c_str(), s3_file->object.c_str(), offset, n); if (s3_file->use_multi_part_download) return ReadS3TransferManager(s3_file, offset, n, buffer, status); else @@ -416,6 +430,8 @@ void Sync(const TF_WritableFile* file, TF_Status* status) { TF_SetStatus(status, TF_OK, ""); return; } + TF_VLog(1, "WriteFileToS3: s3://%s/%s\n", s3_file->bucket.c_str(), + s3_file->object.c_str()); auto position = static_cast(s3_file->outfile->tellp()); auto handle = s3_file->transfer_manager->UploadFile( s3_file->outfile, s3_file->bucket, s3_file->object, @@ -426,6 +442,10 @@ void Sync(const TF_WritableFile* file, TF_Status* status) { while (handle->GetStatus() == Aws::Transfer::TransferStatus::FAILED && retries++ < kUploadRetries) { // if multipart upload was used, only the failed parts will be re-sent + TF_VLog(1, + "Retrying upload of s3://%s/%s after failure. Current retry count: " + "%u\n", + s3_file->bucket.c_str(), s3_file->object.c_str(), retries); s3_file->transfer_manager->RetryUpload(s3_file->outfile, handle); handle->WaitUntilFinished(); } @@ -613,6 +633,7 @@ void NewAppendableFile(const TF_Filesystem* filesystem, const char* path, void Stat(const TF_Filesystem* filesystem, const char* path, TF_FileStatistics* stats, TF_Status* status) { + TF_VLog(1, "Stat on path: %s\n", path); Aws::String bucket, object; ParseS3Path(path, true, &bucket, &object, status); if (TF_GetCode(status) != TF_OK) return; @@ -737,6 +758,8 @@ static void SimpleCopyFile(const Aws::String& source, const Aws::String& bucket_dst, const Aws::String& object_dst, S3File* s3_file, TF_Status* status) { + TF_VLog(1, "SimpleCopyFile from %s to %s/%s\n", bucket_dst.c_str(), + object_dst.c_str()); Aws::S3::Model::CopyObjectRequest copy_object_request; copy_object_request.WithCopySource(source) .WithBucket(bucket_dst) @@ -801,6 +824,8 @@ static void MultiPartCopy(const Aws::String& source, const Aws::String& object_dst, const size_t num_parts, const uint64_t file_size, S3File* s3_file, TF_Status* status) { + TF_VLog(1, "MultiPartCopy from %s to %s/%s\n", bucket_dst.c_str(), + object_dst.c_str()); Aws::S3::Model::CreateMultipartUploadRequest create_multipart_upload_request; create_multipart_upload_request.WithBucket(bucket_dst).WithKey(object_dst); @@ -827,6 +852,8 @@ static void MultiPartCopy(const Aws::String& source, auto chunk_size = s3_file->multi_part_chunk_sizes[Aws::Transfer::TransferDirection::UPLOAD]; + TF_VLog(1, "Copying from %s in %u parts of size %u each\n", source.c_str(), + num_parts, chunk_size); size_t retries = 0; while (retries++ < 3) { // Queue up parts. @@ -891,6 +918,9 @@ static void MultiPartCopy(const Aws::String& source, status); } else { // Retry. 
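+          // Count the part as unfinished again so a later pass of the retry
+          // loop re-queues it.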
+ TF_Log(TF_ERROR, + "Retrying failed copy of part %u due to an error with S3\n", + part_number); num_finished_parts--; } } @@ -967,6 +997,7 @@ void CopyFile(const TF_Filesystem* filesystem, const char* src, const char* dst, void DeleteFile(const TF_Filesystem* filesystem, const char* path, TF_Status* status) { + TF_VLog(1, "DeleteFile: %s\n", path); Aws::String bucket, object; ParseS3Path(path, false, &bucket, &object, status); if (TF_GetCode(status) != TF_OK) return; @@ -985,6 +1016,7 @@ void DeleteFile(const TF_Filesystem* filesystem, const char* path, void CreateDir(const TF_Filesystem* filesystem, const char* path, TF_Status* status) { + TF_VLog(1, "CreateDir: %s\n", path); Aws::String bucket, object; ParseS3Path(path, true, &bucket, &object, status); if (TF_GetCode(status) != TF_OK) return; @@ -1026,6 +1058,7 @@ void CreateDir(const TF_Filesystem* filesystem, const char* path, void DeleteDir(const TF_Filesystem* filesystem, const char* path, TF_Status* status) { + TF_VLog(1, "DeleteDir: %s\n", path); Aws::String bucket, object; ParseS3Path(path, false, &bucket, &object, status); if (TF_GetCode(status) != TF_OK) return; @@ -1060,6 +1093,7 @@ void DeleteDir(const TF_Filesystem* filesystem, const char* path, void RenameFile(const TF_Filesystem* filesystem, const char* src, const char* dst, TF_Status* status) { + TF_VLog(1, "RenameFile from: %s to %s\n", src, dst); Aws::String bucket_src, object_src; ParseS3Path(src, false, &bucket_src, &object_src, status); if (TF_GetCode(status) != TF_OK) return; @@ -1120,6 +1154,7 @@ void RenameFile(const TF_Filesystem* filesystem, const char* src, int GetChildren(const TF_Filesystem* filesystem, const char* path, char*** entries, TF_Status* status) { + TF_VLog(1, "GetChildren for path: %s\n", path); Aws::String bucket, prefix; ParseS3Path(path, true, &bucket, &prefix, status); if (TF_GetCode(status) != TF_OK) return -1; diff --git a/tensorflow/c/experimental/gradients/BUILD b/tensorflow/c/experimental/gradients/BUILD index 80c4e8d9791..36a3251def7 100644 --- a/tensorflow/c/experimental/gradients/BUILD +++ b/tensorflow/c/experimental/gradients/BUILD @@ -3,6 +3,24 @@ package( licenses = ["notice"], # Apache 2.0 ) +cc_library( + name = "array_grad", + srcs = ["array_grad.cc"], + hdrs = [ + "array_grad.h", + ], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + "//tensorflow/c/eager:abstract_operation", + "//tensorflow/c/eager:abstract_tensor_handle", + "//tensorflow/c/eager:c_api_unified_internal", + "//tensorflow/c/eager:gradients", + "//tensorflow/core/lib/llvm_rtti", + ], +) + cc_library( name = "math_grad", srcs = ["math_grad.cc"], @@ -19,6 +37,28 @@ cc_library( "//tensorflow/c/eager:gradients", "//tensorflow/c/experimental/ops:array_ops", "//tensorflow/c/experimental/ops:math_ops", + "//tensorflow/c/experimental/ops:nn_ops", + "//tensorflow/core/lib/llvm_rtti", + ], +) + +cc_library( + name = "nn_grad", + srcs = ["nn_grad.cc"], + hdrs = [ + "nn_grad.h", + ], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + "//tensorflow/c/eager:abstract_operation", + "//tensorflow/c/eager:abstract_tensor_handle", + "//tensorflow/c/eager:c_api_unified_internal", + "//tensorflow/c/eager:gradients", + "//tensorflow/c/experimental/ops:array_ops", + "//tensorflow/c/experimental/ops:math_ops", + "//tensorflow/c/experimental/ops:nn_ops", "//tensorflow/core/lib/llvm_rtti", ], ) diff --git a/tensorflow/c/experimental/gradients/array_grad.cc b/tensorflow/c/experimental/gradients/array_grad.cc new file mode 100644 index 00000000000..069209a4b6b --- 
/dev/null
+++ b/tensorflow/c/experimental/gradients/array_grad.cc
@@ -0,0 +1,48 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/c/experimental/gradients/array_grad.h"
+
+namespace tensorflow {
+namespace gradients {
+namespace {
+using std::vector;
+class IdentityNGradientFunction : public GradientFunction {
+ public:
+  Status Compute(Context* ctx, const IncomingGradients& grad_inputs,
+                 vector<AbstractTensorHandle*>* grad_outputs) override {
+    grad_outputs->resize(grad_inputs.size(), nullptr);
+    for (int i = 0; i < grad_inputs.size(); i++) {
+      auto grad_input = grad_inputs[i];
+      // TODO(srbs): Should we add a copy constructor to AbstractTensorHandle
+      // that takes care of this similar to `Tensor`?
+      if (grad_input) {
+        grad_input->Ref();
+      }
+      (*grad_outputs)[i] = grad_input;
+    }
+    return Status::OK();
+  }
+  ~IdentityNGradientFunction() override {}
+};
+}  // namespace
+
+BackwardFunction* IdentityNRegisterer(const ForwardOperation& op) {
+  auto gradient_function = new IdentityNGradientFunction;
+  auto default_gradients = new PassThroughDefaultGradients(op);
+  return new BackwardFunction(gradient_function, default_gradients);
+}
+
+}  // namespace gradients
+}  // namespace tensorflow
diff --git a/tensorflow/c/experimental/gradients/array_grad.h b/tensorflow/c/experimental/gradients/array_grad.h
new file mode 100644
index 00000000000..edeeb5fcb4a
--- /dev/null
+++ b/tensorflow/c/experimental/gradients/array_grad.h
@@ -0,0 +1,26 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_ARRAY_GRAD_H_
+#define TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_ARRAY_GRAD_H_
+
+#include "tensorflow/c/eager/gradients.h"
+
+namespace tensorflow {
+namespace gradients {
+BackwardFunction* IdentityNRegisterer(const ForwardOperation& op);
+}  // namespace gradients
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_ARRAY_GRAD_H_
diff --git a/tensorflow/c/experimental/gradients/math_grad.cc b/tensorflow/c/experimental/gradients/math_grad.cc
index d8b70848d4e..f298c202046 100644
--- a/tensorflow/c/experimental/gradients/math_grad.cc
+++ b/tensorflow/c/experimental/gradients/math_grad.cc
@@ -15,13 +15,17 @@ limitations under the License.
#include "tensorflow/c/experimental/gradients/math_grad.h" #include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/gradients.h" #include "tensorflow/c/experimental/ops/array_ops.h" #include "tensorflow/c/experimental/ops/math_ops.h" +#include "tensorflow/c/experimental/ops/nn_ops.h" using std::vector; using tensorflow::ops::Conj; using tensorflow::ops::Identity; +using tensorflow::ops::MatMul; using tensorflow::ops::Mul; +using tensorflow::ops::ZerosLike; namespace tensorflow { namespace gradients { @@ -29,20 +33,23 @@ namespace { class AddGradientFunction : public GradientFunction { public: - Status Compute(Context* ctx, - absl::Span grad_inputs, + Status Compute(Context* ctx, const IncomingGradients& grad_inputs, vector* grad_outputs) override { grad_outputs->resize(2); vector identity_outputs(1); // TODO(b/145674566): Handle name unification in tracing code. // TODO(b/161805092): Support broadcasting. + + std::string name = "Identity_A"; TF_RETURN_IF_ERROR(ops::Identity(ctx->ctx, {grad_inputs[0]}, absl::MakeSpan(identity_outputs), - "Identity0")); + name.c_str())); (*grad_outputs)[0] = identity_outputs[0]; + + name = "Identity_B"; TF_RETURN_IF_ERROR(ops::Identity(ctx->ctx, {grad_inputs[0]}, absl::MakeSpan(identity_outputs), - "Identity1")); + name.c_str())); (*grad_outputs)[1] = identity_outputs[0]; return Status::OK(); } @@ -54,16 +61,18 @@ class ExpGradientFunction : public GradientFunction { explicit ExpGradientFunction(AbstractTensorHandle* exp) : exp_(exp) { exp->Ref(); } - Status Compute(Context* ctx, - absl::Span grad_inputs, + Status Compute(Context* ctx, const IncomingGradients& grad_inputs, vector* grad_outputs) override { vector conj_outputs(1); - TF_RETURN_IF_ERROR( - Conj(ctx->ctx, {exp_.get()}, absl::MakeSpan(conj_outputs), "ExpConj")); + std::string name = "Conj_Exp_Grad"; + TF_RETURN_IF_ERROR(Conj(ctx->ctx, {exp_.get()}, + absl::MakeSpan(conj_outputs), name.c_str())); AbstractTensorHandlePtr conj_output_releaser(conj_outputs[0]); grad_outputs->resize(1); + + name = "Mul_Exp_Grad"; TF_RETURN_IF_ERROR(Mul(ctx->ctx, {conj_outputs[0], grad_inputs[0]}, - absl::MakeSpan(*grad_outputs), "ExpGradMul")); + absl::MakeSpan(*grad_outputs), name.c_str())); return Status::OK(); } ~ExpGradientFunction() override {} @@ -72,14 +81,142 @@ class ExpGradientFunction : public GradientFunction { AbstractTensorHandlePtr exp_; }; +class MatMulGradientFunction : public GradientFunction { + public: + explicit MatMulGradientFunction(vector f_inputs, + AttrBuilder f_attrs) + : forward_inputs(f_inputs), forward_attrs(f_attrs) {} + + Status Compute(Context* ctx, const IncomingGradients& grad_inputs, + vector* grad_outputs) override { + /* Given upstream grad U and a matmul op A*B, the gradients are: + * + * dA = U * B.T + * dB = A.T * U + * + * where A.T means `transpose(A)` + */ + AbstractTensorHandle* upstream_grad = grad_inputs[0]; + grad_outputs->resize(2); + + // Get transpose attrs + bool t_a; + forward_attrs.Get("transpose_a", &t_a); + + bool t_b; + forward_attrs.Get("transpose_b", &t_b); + + // Conj each input + vector conj_outputs(1); + std::string name = "Conj_A_MatMul_Grad"; + TF_RETURN_IF_ERROR(Conj(ctx->ctx, {forward_inputs[0]}, + absl::MakeSpan(conj_outputs), name.c_str())); + + AbstractTensorHandle* A = conj_outputs[0]; + + name = "Conj_B_MatMul_Grad"; + TF_RETURN_IF_ERROR(Conj(ctx->ctx, {forward_inputs[1]}, + absl::MakeSpan(conj_outputs), name.c_str())); + + AbstractTensorHandle* B = conj_outputs[0]; + + // Calc Grad + vector matmul_A_outputs(1); + vector 
matmul_B_outputs(1); + std::string name_grad_A = "MatMul_Grad_A"; + std::string name_grad_B = "MatMul_Grad_B"; + if (!t_a && !t_b) { + TF_RETURN_IF_ERROR(MatMul(ctx->ctx, {upstream_grad, B}, + absl::MakeSpan(matmul_A_outputs), + name_grad_A.c_str(), + /*transpose_a = */ false, + /*transpose_b = */ true)); + + TF_RETURN_IF_ERROR(MatMul(ctx->ctx, {A, upstream_grad}, + absl::MakeSpan(matmul_B_outputs), + name_grad_B.c_str(), + /*transpose_a = */ true, + /*transpose_b = */ false)); + } else if (!t_a && t_b) { + TF_RETURN_IF_ERROR(MatMul(ctx->ctx, {upstream_grad, B}, + absl::MakeSpan(matmul_A_outputs), + name_grad_A.c_str(), + /*transpose_a = */ false, + /*transpose_b = */ false)); + + TF_RETURN_IF_ERROR(MatMul(ctx->ctx, {upstream_grad, A}, + absl::MakeSpan(matmul_B_outputs), + name_grad_B.c_str(), + /*transpose_a = */ true, + /*transpose_b = */ false)); + + } else if (t_a && !t_b) { + TF_RETURN_IF_ERROR(MatMul(ctx->ctx, {B, upstream_grad}, + absl::MakeSpan(matmul_A_outputs), + name_grad_A.c_str(), + /*transpose_a = */ false, + /*transpose_b = */ true)); + + TF_RETURN_IF_ERROR(MatMul(ctx->ctx, {A, upstream_grad}, + absl::MakeSpan(matmul_B_outputs), + name_grad_B.c_str(), + /*transpose_a = */ false, + /*transpose_b = */ false)); + } else { // t_a && t_b + TF_RETURN_IF_ERROR(MatMul(ctx->ctx, {B, upstream_grad}, + absl::MakeSpan(matmul_A_outputs), + name_grad_A.c_str(), + /*transpose_a = */ true, + /*transpose_b = */ true)); + + TF_RETURN_IF_ERROR(MatMul(ctx->ctx, {upstream_grad, A}, + absl::MakeSpan(matmul_B_outputs), + name_grad_B.c_str(), + /*transpose_a = */ true, + /*transpose_b = */ true)); + } + + // Gradient for A + (*grad_outputs)[0] = matmul_A_outputs[0]; + + // Gradient for B + (*grad_outputs)[1] = matmul_B_outputs[0]; + return Status::OK(); + } + ~MatMulGradientFunction() override {} + + private: + vector forward_inputs; + AttrBuilder forward_attrs; +}; + } // namespace -GradientFunction* AddRegisterer(const ForwardOperation& op) { - return new AddGradientFunction; +BackwardFunction* AddRegisterer(const ForwardOperation& op) { + auto gradient_function = new AddGradientFunction; + // For ops with a single output, the gradient function is not called if there + // is no incoming gradient. So we do not need to worry about creating zeros + // grads in this case. + auto default_gradients = new PassThroughDefaultGradients(op); + return new BackwardFunction(gradient_function, default_gradients); } -GradientFunction* ExpRegisterer(const ForwardOperation& op) { - return new ExpGradientFunction(op.outputs[0]); +BackwardFunction* ExpRegisterer(const ForwardOperation& op) { + auto gradient_function = new ExpGradientFunction(op.outputs[0]); + // For ops with a single output, the gradient function is not called if there + // is no incoming gradient. So we do not need to worry about creating zeros + // grads in this case. + auto default_gradients = new PassThroughDefaultGradients(op); + return new BackwardFunction(gradient_function, default_gradients); +} + +BackwardFunction* MatMulRegisterer(const ForwardOperation& op) { + auto gradient_function = new MatMulGradientFunction(op.inputs, op.attrs); + // For ops with a single output, the gradient function is not called if there + // is no incoming gradient. So we do not need to worry about creating zeros + // grads in this case. 
+  auto default_gradients = new PassThroughDefaultGradients(op);
+  return new BackwardFunction(gradient_function, default_gradients);
 }
 
 }  // namespace gradients
diff --git a/tensorflow/c/experimental/gradients/math_grad.h b/tensorflow/c/experimental/gradients/math_grad.h
index 6c7242a1a49..205419e1201 100644
--- a/tensorflow/c/experimental/gradients/math_grad.h
+++ b/tensorflow/c/experimental/gradients/math_grad.h
@@ -19,9 +19,10 @@ limitations under the License.
 
 namespace tensorflow {
 namespace gradients {
-GradientFunction* AddRegisterer(const ForwardOperation& op);
-GradientFunction* ExpRegisterer(const ForwardOperation& op);
+BackwardFunction* AddRegisterer(const ForwardOperation& op);
+BackwardFunction* ExpRegisterer(const ForwardOperation& op);
+BackwardFunction* MatMulRegisterer(const ForwardOperation& op);
 }  // namespace gradients
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_MATH_GRAD_H_
+#endif  // TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_MATH_GRAD_H_
\ No newline at end of file
diff --git a/tensorflow/c/experimental/gradients/nn_grad.cc b/tensorflow/c/experimental/gradients/nn_grad.cc
new file mode 100644
index 00000000000..3da1e0dc153
--- /dev/null
+++ b/tensorflow/c/experimental/gradients/nn_grad.cc
@@ -0,0 +1,111 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
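For orientation, here is a minimal sketch of how the AddRegisterer, ExpRegisterer, and MatMulRegisterer factories above are typically wired into a gradient registry. This assumes the GradientRegistry::Register API declared in tensorflow/c/eager/gradients.h; the RegisterGradients helper name is illustrative and not part of this patch.

  // Sketch only: registers the backward-function factories defined above.
  // Assumes GradientRegistry::Register(op_name, factory) from
  // tensorflow/c/eager/gradients.h.
  Status RegisterGradients(GradientRegistry* registry) {
    TF_RETURN_IF_ERROR(registry->Register("AddV2", AddRegisterer));
    TF_RETURN_IF_ERROR(registry->Register("Exp", ExpRegisterer));
    TF_RETURN_IF_ERROR(registry->Register("MatMul", MatMulRegisterer));
    return Status::OK();
  }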
+==============================================================================*/ +#include "tensorflow/c/experimental/gradients/nn_grad.h" + +#include "tensorflow/c/experimental/ops/array_ops.h" +#include "tensorflow/c/experimental/ops/math_ops.h" +#include "tensorflow/c/experimental/ops/nn_ops.h" + +using std::vector; +using tensorflow::ops::Conj; +using tensorflow::ops::Identity; +using tensorflow::ops::Mul; +using tensorflow::ops::ReluGrad; +using tensorflow::ops::SparseSoftmaxCrossEntropyLoss; +using tensorflow::ops::ZerosLike; + +namespace tensorflow { +namespace gradients { +namespace { + +class ReluGradientFunction : public GradientFunction { + public: + explicit ReluGradientFunction(vector f_outputs) + : forward_outputs(f_outputs) {} + + Status Compute(Context* ctx, const IncomingGradients& grad_inputs, + vector* grad_outputs) override { + AbstractTensorHandle* upstream_grad = grad_inputs[0]; + AbstractTensorHandle* activations = forward_outputs[0]; + grad_outputs->resize(1); + vector relugrad_outputs(1); + + // Calculate Grad + std::string name = "relu_grad"; + + TF_RETURN_IF_ERROR(ReluGrad(ctx->ctx, {upstream_grad, activations}, + absl::MakeSpan(relugrad_outputs), + name.c_str())); + (*grad_outputs)[0] = relugrad_outputs[0]; + + return Status::OK(); + } + ~ReluGradientFunction() override {} + + private: + vector forward_outputs; +}; + +class SparseSoftmaxCrossEntropyLossGradientFunction : public GradientFunction { + public: + explicit SparseSoftmaxCrossEntropyLossGradientFunction( + vector f_outputs) + : forward_outputs(f_outputs) {} + + Status Compute(Context* ctx, const IncomingGradients& grad_inputs, + vector* grad_outputs) override { + grad_outputs->resize(2); + + // Grad for Softmax Input + std::string name = "Mul_Softmax_Grad"; + vector mul_outputs(1); + TF_RETURN_IF_ERROR( + ops::Mul(ctx->ctx, {grad_inputs[0], forward_outputs[1]}, + absl::MakeSpan(mul_outputs), + name.c_str())); // upstream_grad * local softmax grad + (*grad_outputs)[0] = mul_outputs[0]; + + // Grad for labels is null + (*grad_outputs)[1] = nullptr; + + return Status::OK(); + } + ~SparseSoftmaxCrossEntropyLossGradientFunction() override {} + + private: + vector forward_outputs; +}; + +} // namespace + +BackwardFunction* ReluRegisterer(const ForwardOperation& op) { + auto gradient_function = new ReluGradientFunction(op.outputs); + // For ops with a single output, the gradient function is not called if there + // is no incoming gradient. So we do not need to worry about creating zeros + // grads in this case. + auto default_gradients = new PassThroughDefaultGradients(op); + return new BackwardFunction(gradient_function, default_gradients); +} + +BackwardFunction* SparseSoftmaxCrossEntropyLossRegisterer( + const ForwardOperation& op) { + auto gradient_function = + new SparseSoftmaxCrossEntropyLossGradientFunction(op.outputs); + auto default_gradients = new PassThroughDefaultGradients(op); + return new BackwardFunction(gradient_function, default_gradients); +} + +} // namespace gradients +} // namespace tensorflow diff --git a/tensorflow/c/experimental/gradients/nn_grad.h b/tensorflow/c/experimental/gradients/nn_grad.h new file mode 100644 index 00000000000..d002725847f --- /dev/null +++ b/tensorflow/c/experimental/gradients/nn_grad.h @@ -0,0 +1,28 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
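A brief note on the SparseSoftmaxCrossEntropyLoss gradient above: the underlying SparseSoftmaxCrossEntropyWithLogits op returns both the per-example loss and a backprop tensor, so the gradient function only multiplies the incoming gradient by that saved backprop output (forward_outputs[1]). The labels input receives a null gradient because labels are integer class indices and are not differentiable.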
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_NN_GRAD_H_ +#define TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_NN_GRAD_H_ + +#include "tensorflow/c/eager/gradients.h" + +namespace tensorflow { +namespace gradients { +BackwardFunction* ReluRegisterer(const ForwardOperation& op); +BackwardFunction* SparseSoftmaxCrossEntropyLossRegisterer( + const ForwardOperation& op); +} // namespace gradients +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_NN_GRAD_H_ \ No newline at end of file diff --git a/tensorflow/c/experimental/ops/BUILD b/tensorflow/c/experimental/ops/BUILD index d13d7a72d3e..3504737c314 100644 --- a/tensorflow/c/experimental/ops/BUILD +++ b/tensorflow/c/experimental/ops/BUILD @@ -15,7 +15,6 @@ cc_library( "//tensorflow:internal", ], deps = [ - "//tensorflow/c/eager:abstract_context", "//tensorflow/c/eager:abstract_operation", "//tensorflow/c/eager:abstract_tensor_handle", "//tensorflow/c/eager:c_api_unified_internal", @@ -36,12 +35,30 @@ cc_library( "//tensorflow:internal", ], deps = [ - ":array_ops", - "//tensorflow/c/eager:abstract_context", "//tensorflow/c/eager:abstract_operation", "//tensorflow/c/eager:abstract_tensor_handle", "//tensorflow/c/eager:c_api_unified_internal", - "//tensorflow/core:framework_headers_lib", + "//tensorflow/c/experimental/ops:array_ops", + "//tensorflow/core/lib/llvm_rtti", + "//tensorflow/core/platform:errors", + ], +) + +cc_library( + name = "nn_ops", + srcs = [ + "nn_ops.cc", + ], + hdrs = [ + "nn_ops.h", + ], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + "//tensorflow/c/eager:abstract_operation", + "//tensorflow/c/eager:abstract_tensor_handle", + "//tensorflow/c/eager:c_api_unified_internal", "//tensorflow/core/lib/llvm_rtti", "//tensorflow/core/platform:errors", ], diff --git a/tensorflow/c/experimental/ops/array_ops.cc b/tensorflow/c/experimental/ops/array_ops.cc index ab2d114d9d9..df0f4639fbd 100644 --- a/tensorflow/c/experimental/ops/array_ops.cc +++ b/tensorflow/c/experimental/ops/array_ops.cc @@ -19,7 +19,7 @@ limitations under the License. namespace tensorflow { namespace ops { -// Creates an Identity op. 
+ Status Identity(AbstractContext* ctx, absl::Span inputs, absl::Span outputs, const char* name) { @@ -35,5 +35,19 @@ Status Identity(AbstractContext* ctx, return identity_op->Execute(outputs, &num_retvals); } +Status ZerosLike(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name) { + AbstractOperationPtr z_op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(z_op->Reset("ZerosLike", /*raw_device_name=*/nullptr)); + if (isa(z_op.get())) { + TF_RETURN_IF_ERROR( + dyn_cast(z_op.get())->SetOpName(name)); + } + TF_RETURN_IF_ERROR(z_op->AddInput(inputs[0])); + int num_retvals = 1; + return z_op->Execute(outputs, &num_retvals); +} + } // namespace ops } // namespace tensorflow diff --git a/tensorflow/c/experimental/ops/array_ops.h b/tensorflow/c/experimental/ops/array_ops.h index 226461fd286..8dc68db673f 100644 --- a/tensorflow/c/experimental/ops/array_ops.h +++ b/tensorflow/c/experimental/ops/array_ops.h @@ -22,9 +22,15 @@ limitations under the License. namespace tensorflow { namespace ops { + Status Identity(AbstractContext* ctx, absl::Span inputs, absl::Span outputs, const char* name); + +Status ZerosLike(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name); + } // namespace ops } // namespace tensorflow diff --git a/tensorflow/c/experimental/ops/math_ops.cc b/tensorflow/c/experimental/ops/math_ops.cc index e91acbd6370..82c2f0e8169 100644 --- a/tensorflow/c/experimental/ops/math_ops.cc +++ b/tensorflow/c/experimental/ops/math_ops.cc @@ -51,5 +51,60 @@ Status Conj(AbstractContext* ctx, return Status::OK(); } +Status Add(AbstractContext* ctx, absl::Span inputs, + absl::Span outputs, const char* name) { + AbstractOperationPtr add_op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(add_op->Reset("AddV2", /*raw_device_name=*/nullptr)); + + if (isa(add_op.get())) { + TF_RETURN_IF_ERROR( + dyn_cast(add_op.get())->SetOpName(name)); + } + + TF_RETURN_IF_ERROR(add_op->AddInput(inputs[0])); + TF_RETURN_IF_ERROR(add_op->AddInput(inputs[1])); + + int num_retvals = 1; + TF_RETURN_IF_ERROR(add_op->Execute(outputs, &num_retvals)); + return Status::OK(); +} + +Status MatMul(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name, + bool transpose_a = false, bool transpose_b = false) { + AbstractOperationPtr matmul_op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(matmul_op->Reset("MatMul", /*raw_device_name=*/nullptr)); + + if (isa(matmul_op.get())) { + TF_RETURN_IF_ERROR( + dyn_cast(matmul_op.get())->SetOpName(name)); + } + + TF_RETURN_IF_ERROR(matmul_op->AddInput(inputs[0])); + TF_RETURN_IF_ERROR(matmul_op->AddInput(inputs[1])); + + TF_RETURN_IF_ERROR(matmul_op->SetAttrBool("transpose_a", transpose_a)); + TF_RETURN_IF_ERROR(matmul_op->SetAttrBool("transpose_b", transpose_b)); + + int num_retvals = 1; + TF_RETURN_IF_ERROR(matmul_op->Execute(outputs, &num_retvals)); + return Status::OK(); +} + +Status Neg(AbstractContext* ctx, absl::Span inputs, + absl::Span outputs, const char* name) { + AbstractOperationPtr neg_op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(neg_op->Reset("Neg", /*raw_device_name=*/nullptr)); + if (isa(neg_op.get())) { + TF_RETURN_IF_ERROR( + dyn_cast(neg_op.get())->SetOpName(name)); + } + TF_RETURN_IF_ERROR(neg_op->AddInput(inputs[0])); + + int num_retvals = 1; + return neg_op->Execute(outputs, &num_retvals); +} + } // namespace ops } // namespace tensorflow diff --git a/tensorflow/c/experimental/ops/math_ops.h b/tensorflow/c/experimental/ops/math_ops.h index 4d7c3d838ce..ed1e6c5b3d6 100644 --- 
a/tensorflow/c/experimental/ops/math_ops.h +++ b/tensorflow/c/experimental/ops/math_ops.h @@ -25,6 +25,15 @@ Status Mul(AbstractContext* ctx, absl::Span inputs, Status Conj(AbstractContext* ctx, absl::Span inputs, absl::Span outputs, const char* name); +Status Add(AbstractContext* ctx, absl::Span inputs, + absl::Span outputs, const char* name); +Status MatMul(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name, + bool transpose_a, bool transpose_b); +Status Neg(AbstractContext* ctx, absl::Span inputs, + absl::Span outputs, const char* name); + } // namespace ops } // namespace tensorflow diff --git a/tensorflow/c/experimental/ops/nn_ops.cc b/tensorflow/c/experimental/ops/nn_ops.cc new file mode 100644 index 00000000000..8f5f550bb8b --- /dev/null +++ b/tensorflow/c/experimental/ops/nn_ops.cc @@ -0,0 +1,67 @@ + +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/c/experimental/ops/nn_ops.h" + +#include "tensorflow/core/platform/errors.h" + +namespace tensorflow { +namespace ops { + +// Softmax Loss given scores and labels, used by the SoftMaxLossGradient +Status SparseSoftmaxCrossEntropyLoss( + AbstractContext* ctx, absl::Span inputs, + absl::Span outputs, const char* name) { + AbstractOperationPtr sm_loss_op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(sm_loss_op->Reset("SparseSoftmaxCrossEntropyWithLogits", + /*raw_device_name=*/nullptr)); + + if (isa(sm_loss_op.get())) { + TF_RETURN_IF_ERROR( + dyn_cast(sm_loss_op.get())->SetOpName(name)); + } + + TF_RETURN_IF_ERROR(sm_loss_op->AddInput(inputs[0])); // input scores + TF_RETURN_IF_ERROR(sm_loss_op->AddInput(inputs[1])); // labels + + // Outputs will contain: [loss_vals, gradients]. + int num_retvals = 2; + TF_RETURN_IF_ERROR(sm_loss_op->Execute(outputs, &num_retvals)); + return Status::OK(); +} + +// Computes Relu gradient given input features +Status ReluGrad(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name) { + AbstractOperationPtr relugrad_op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR( + relugrad_op->Reset("ReluGrad", /*raw_device_name=*/nullptr)); + + if (isa(relugrad_op.get())) { + TF_RETURN_IF_ERROR(dyn_cast(relugrad_op.get()) + ->SetOpName(name)); + } + + TF_RETURN_IF_ERROR(relugrad_op->AddInput(inputs[0])); // upstream grads + TF_RETURN_IF_ERROR(relugrad_op->AddInput(inputs[1])); // relu inputs + + int num_retvals = 1; + TF_RETURN_IF_ERROR(relugrad_op->Execute(outputs, &num_retvals)); + return Status::OK(); +} + +} // namespace ops +} // namespace tensorflow diff --git a/tensorflow/c/experimental/ops/nn_ops.h b/tensorflow/c/experimental/ops/nn_ops.h new file mode 100644 index 00000000000..3e618b00869 --- /dev/null +++ b/tensorflow/c/experimental/ops/nn_ops.h @@ -0,0 +1,37 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
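As a usage illustration for the Add, MatMul, and Neg helpers declared just above in math_ops.h, a hedged sketch follows. Here `ctx`, `a`, and `b` are assumed to be an existing AbstractContext* and two AbstractTensorHandle* operands; none of this is part of the patch itself.

  // Hypothetical call site for the new ops::MatMul wrapper.
  std::vector<AbstractTensorHandle*> matmul_outputs(1);
  TF_RETURN_IF_ERROR(ops::MatMul(ctx, {a, b}, absl::MakeSpan(matmul_outputs),
                                 "example_matmul",
                                 /*transpose_a=*/false,
                                 /*transpose_b=*/false));
  // The caller is responsible for releasing the returned handle.
  AbstractTensorHandle* product = matmul_outputs[0];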
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_NN_OPS_H_ +#define TENSORFLOW_C_EXPERIMENTAL_OPS_NN_OPS_H_ + +#include "tensorflow/c/eager/abstract_operation.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" +#include "tensorflow/core/lib/llvm_rtti/llvm_rtti.h" + +namespace tensorflow { +namespace ops { + +Status SparseSoftmaxCrossEntropyLoss( + AbstractContext* ctx, absl::Span inputs, + absl::Span outputs, const char* name); + +Status ReluGrad(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name); + +} // namespace ops +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_OPS_NN_OPS_H_ diff --git a/tensorflow/c/experimental/saved_model/core/BUILD b/tensorflow/c/experimental/saved_model/core/BUILD index b2e432782de..2feb7c1b33e 100644 --- a/tensorflow/c/experimental/saved_model/core/BUILD +++ b/tensorflow/c/experimental/saved_model/core/BUILD @@ -44,7 +44,9 @@ cc_library( ], deps = [ ":concrete_function", + ":signature_def_function", "//tensorflow/core:lib", + "@com_google_absl//absl/strings", ], ) @@ -70,6 +72,26 @@ cc_library( ], ) +cc_library( + name = "signature_def_function", + hdrs = [ + "signature_def_function.h", + ], + deps = [ + ":signature_def_function_metadata", + "//tensorflow/c/eager:immediate_execution_operation", + "//tensorflow/c/eager:immediate_execution_tensor_handle", + "@com_google_absl//absl/types:span", + ], +) + +cc_library( + name = "signature_def_function_metadata", + hdrs = [ + "signature_def_function_metadata.h", + ], +) + cc_library( name = "test_utils", testonly = True, @@ -115,6 +137,7 @@ cc_library( ":concrete_function", ":saved_model_api", ":saved_model_utils", + ":signature_def_function", "//tensorflow/c:tensor_interface", "//tensorflow/c/eager:immediate_execution_context", "//tensorflow/c/eager:immediate_execution_tensor_handle", @@ -206,13 +229,13 @@ tf_cc_test( "//tensorflow/c/experimental/saved_model/core/revived_types:constant", "//tensorflow/core:all_kernels", "//tensorflow/core:framework", - "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core/common_runtime:core_cpu_lib", "//tensorflow/core/common_runtime/eager:context", "//tensorflow/core/common_runtime/eager:core", + "//tensorflow/core/common_runtime/eager:tensor_handle", ], ) diff --git a/tensorflow/c/experimental/saved_model/core/concrete_function.h b/tensorflow/c/experimental/saved_model/core/concrete_function.h index da3a64b91a3..934fa6d2bda 100644 --- a/tensorflow/c/experimental/saved_model/core/concrete_function.h +++ b/tensorflow/c/experimental/saved_model/core/concrete_function.h @@ -26,10 +26,14 @@ limitations under the License. 
namespace tensorflow { -// Note that ConcreteFunctions's lifetimes are effectively bound -// to the SavedModel they are loaded from, since they retain pointers -// to the TensorHandles owned by the SavedModel, and the FunctionDef -// of the SavedModel. +// ConcreteFunctions correspond to an instance of a tf.function with a known set +// of inputs (either through get_concrete_function) or an input_signature. +// ConcreteFunction attempts to preserve the user-facing semantics of the +// tf.function python API and can take a limited set of types as arguments +// (to be modeled in tensorflow::Value), not just Tensors. +// SavedModelAPI's ConcreteFunctions' lifetimes are bound to the SavedModel they +// are loaded from, since they retain pointers to the TensorHandles owned by the +// SavedModel, and the FunctionDef of the SavedModel. // Note(bmzhao): This class is only TEMPORARILY virtual, as a way to unblock // TFRT integration with TF Serving. Do not add more virtual implementations of // this class. Eventually we want to remove this virtual base class indirection diff --git a/tensorflow/c/experimental/saved_model/core/ops/variable_ops.cc b/tensorflow/c/experimental/saved_model/core/ops/variable_ops.cc index 492a58f816d..be9ffff99ff 100644 --- a/tensorflow/c/experimental/saved_model/core/ops/variable_ops.cc +++ b/tensorflow/c/experimental/saved_model/core/ops/variable_ops.cc @@ -37,10 +37,11 @@ static const char kNoSharingResourceID[] = Status CreateUninitializedResourceVariable(ImmediateExecutionContext* ctx, DataType dtype, TensorShape shape, + const char* raw_device_name, ImmediateTensorHandlePtr* handle) { ImmediateOpPtr varhandle_op(ctx->CreateOperation()); - TF_RETURN_IF_ERROR(varhandle_op->Reset("VarHandleOp", nullptr)); + TF_RETURN_IF_ERROR(varhandle_op->Reset("VarHandleOp", raw_device_name)); TF_RETURN_IF_ERROR(varhandle_op->SetAttrType("dtype", dtype)); // Note that if shape is unknown rank, shape.dim_sizes() will be empty, and diff --git a/tensorflow/c/experimental/saved_model/core/ops/variable_ops.h b/tensorflow/c/experimental/saved_model/core/ops/variable_ops.h index 13c941a77fe..accad1591da 100644 --- a/tensorflow/c/experimental/saved_model/core/ops/variable_ops.h +++ b/tensorflow/c/experimental/saved_model/core/ops/variable_ops.h @@ -31,6 +31,7 @@ namespace internal { // https://github.com/tensorflow/tensorflow/blob/516608035f85cec8b126712b0ff8407220206b22/tensorflow/python/ops/resource_variable_ops.py#L1867-L1872 Status CreateUninitializedResourceVariable(ImmediateExecutionContext* ctx, DataType dtype, TensorShape shape, + const char* raw_device_name, ImmediateTensorHandlePtr* handle); // Executes an AssignVariableOp using `ctx`, assigning the variable associated diff --git a/tensorflow/c/experimental/saved_model/core/ops/variable_ops_test.cc b/tensorflow/c/experimental/saved_model/core/ops/variable_ops_test.cc index 55a4a32e983..5ce027fe6d8 100644 --- a/tensorflow/c/experimental/saved_model/core/ops/variable_ops_test.cc +++ b/tensorflow/c/experimental/saved_model/core/ops/variable_ops_test.cc @@ -55,7 +55,7 @@ TEST_F(VariableOpsTest, CreateVariableSuccessful) { // Create a DT_Resource TensorHandle that points to a scalar DT_FLOAT tensor ImmediateTensorHandlePtr handle; TF_EXPECT_OK(internal::CreateUninitializedResourceVariable( - context(), DT_FLOAT, {}, &handle)); + context(), DT_FLOAT, {}, nullptr, &handle)); // The created TensorHandle should be a DT_Resource EXPECT_EQ(handle->DataType(), DT_RESOURCE); } @@ -65,7 +65,7 @@ TEST_F(VariableOpsTest, DestroyVariableSuccessful) 
{ // Create a DT_Resource TensorHandle that points to a scalar DT_FLOAT tensor ImmediateTensorHandlePtr handle; TF_EXPECT_OK(internal::CreateUninitializedResourceVariable( - context(), DT_FLOAT, {}, &handle)); + context(), DT_FLOAT, {}, nullptr, &handle)); // Destroy the variable TF_EXPECT_OK(internal::DestroyResource(context(), handle.get())); @@ -76,7 +76,7 @@ TEST_F(VariableOpsTest, AssignVariableAndReadSuccessful) { // Create a DT_Resource TensorHandle that points to a scalar DT_FLOAT tensor ImmediateTensorHandlePtr variable; TF_EXPECT_OK(internal::CreateUninitializedResourceVariable( - context(), DT_FLOAT, {}, &variable)); + context(), DT_FLOAT, {}, nullptr, &variable)); // Create a Scalar float TensorHandle with value 42, and assign it to // the variable. diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/variable.cc b/tensorflow/c/experimental/saved_model/core/revived_types/variable.cc index d831a8dd840..a212c25bd28 100644 --- a/tensorflow/c/experimental/saved_model/core/revived_types/variable.cc +++ b/tensorflow/c/experimental/saved_model/core/revived_types/variable.cc @@ -65,10 +65,11 @@ Status Variable::ReadValue(ImmediateTensorHandlePtr* out) { Status Variable::CreateUninitialized(ImmediateExecutionContext* ctx, DataType dtype, TensorShape shape, absl::optional name, + const char* raw_device_name, std::unique_ptr* output) { ImmediateTensorHandlePtr handle; TF_RETURN_IF_ERROR(internal::CreateUninitializedResourceVariable( - ctx, dtype, shape, &handle)); + ctx, dtype, shape, raw_device_name, &handle)); output->reset( new Variable(ctx, dtype, shape, std::move(name), std::move(handle))); diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/variable.h b/tensorflow/c/experimental/saved_model/core/revived_types/variable.h index 48ea1d08862..13f56fda5f3 100644 --- a/tensorflow/c/experimental/saved_model/core/revived_types/variable.h +++ b/tensorflow/c/experimental/saved_model/core/revived_types/variable.h @@ -37,6 +37,7 @@ class Variable : public TensorHandleConvertible { static Status CreateUninitialized(ImmediateExecutionContext* ctx, DataType dtype, TensorShape shape, absl::optional name, + const char* raw_device_name, std::unique_ptr* output); // The dtype of the underlying variable. diff --git a/tensorflow/c/experimental/saved_model/core/saved_model_api.h b/tensorflow/c/experimental/saved_model/core/saved_model_api.h index 5d0ed63a765..ff891e13ba4 100644 --- a/tensorflow/c/experimental/saved_model/core/saved_model_api.h +++ b/tensorflow/c/experimental/saved_model/core/saved_model_api.h @@ -22,6 +22,7 @@ limitations under the License. 
#include #include "tensorflow/c/experimental/saved_model/core/concrete_function.h" +#include "tensorflow/c/experimental/saved_model/core/signature_def_function.h" #include "tensorflow/core/platform/status.h" namespace tensorflow { @@ -39,11 +40,11 @@ class SavedModelAPI { virtual Status GetFunction(const std::string& function_path, ConcreteFunction** function) = 0; - // Retrieve a function from a SavedModel, using the key of the + // Retrieve a SignatureDefFunction from a SavedModel, using the key of the // SignatureDef map: // https://github.com/tensorflow/tensorflow/blob/69b08900b1e991d84bce31f3b404f5ed768f339f/tensorflow/core/protobuf/meta_graph.proto#L89 virtual Status GetSignatureDefFunction(const std::string& signature_def_key, - ConcreteFunction** function) = 0; + SignatureDefFunction** function) = 0; virtual std::vector ListFunctions() = 0; diff --git a/tensorflow/c/experimental/saved_model/core/saved_model_utils.cc b/tensorflow/c/experimental/saved_model/core/saved_model_utils.cc index 0d97741d7f0..e79fd8d7001 100644 --- a/tensorflow/c/experimental/saved_model/core/saved_model_utils.cc +++ b/tensorflow/c/experimental/saved_model/core/saved_model_utils.cc @@ -122,9 +122,9 @@ Status LoadSavedVariable(ImmediateExecutionContext* ctx, tensorflow::TensorShape shape(variable.shape()); tensorflow::DataType dtype = variable.dtype(); - TF_RETURN_IF_ERROR( - Variable::CreateUninitialized(ctx, dtype, shape, name, output)); - + TF_RETURN_IF_ERROR(Variable::CreateUninitialized( + ctx, dtype, shape, name, + variable.device().empty() ? nullptr : variable.device().c_str(), output)); return Status(); } diff --git a/tensorflow/c/experimental/saved_model/core/saved_variable_loading_test.cc b/tensorflow/c/experimental/saved_model/core/saved_variable_loading_test.cc index cf58e5e3536..45b0ac00c9b 100644 --- a/tensorflow/c/experimental/saved_model/core/saved_variable_loading_test.cc +++ b/tensorflow/c/experimental/saved_model/core/saved_variable_loading_test.cc @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/c/tensor_interface.h" #include "tensorflow/core/common_runtime/device_mgr.h" #include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/common_runtime/eager/tensor_handle.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/framework/tensor_shape.h" @@ -38,9 +39,15 @@ namespace { class SavedVariableLoadingTest : public ::testing::TestWithParam< std::tuple>> { public: - SavedVariableLoadingTest() - : device_mgr_(testing::CreateTestingDeviceMgr()), - ctx_(testing::CreateTestingEagerContext(device_mgr_.get())) {} + SavedVariableLoadingTest() { + SessionOptions options; + options.config.mutable_device_count()->insert({"CPU", 3}); + std::vector> devices; + TF_CHECK_OK(DeviceFactory::AddDevices( + options, "/job:localhost/replica:0/task:0", &devices)); + device_mgr_ = absl::make_unique(std::move(devices)); + ctx_ = testing::CreateTestingEagerContext(device_mgr_.get()); + } EagerContext* context() { return ctx_.get(); } @@ -67,6 +74,39 @@ TEST_P(SavedVariableLoadingTest, LoadSavedVariableSuccessful) { EXPECT_EQ(var->shape(), shape); } +// Verify that a device specified in the SavedVariable is kept. 
+TEST_P(SavedVariableLoadingTest, LoadSavedVariableWithDevice) { + auto& test_params = GetParam(); + DataType dtype = std::get<0>(test_params); + TensorShape shape(std::get<1>(test_params)); + + SavedVariable saved_variable; + saved_variable.set_dtype(dtype); + saved_variable.set_device("/job:localhost/replica:0/task:0/device:CPU:1"), + shape.AsProto(saved_variable.mutable_shape()); + + std::unique_ptr var; + TF_ASSERT_OK(internal::LoadSavedVariable(context(), saved_variable, &var)); + EXPECT_EQ(down_cast(var->handle())->resource_device()->name(), + "/job:localhost/replica:0/task:0/device:CPU:1"); +} + +// Verify load failure if a non-existing device is specified. +TEST_P(SavedVariableLoadingTest, LoadSavedVariableWithInvalidDevice) { + auto& test_params = GetParam(); + DataType dtype = std::get<0>(test_params); + TensorShape shape(std::get<1>(test_params)); + + SavedVariable saved_variable; + saved_variable.set_dtype(dtype); + saved_variable.set_device("/job:localhost/replica:0/task:0/device:CPU:99"), + shape.AsProto(saved_variable.mutable_shape()); + + std::unique_ptr var; + ASSERT_NE(Status::OK(), + internal::LoadSavedVariable(context(), saved_variable, &var)); +} + // Assigning and reading values should yield // consistent results. TEST_P(SavedVariableLoadingTest, AssignAndReadVariableSuccesful) { @@ -79,7 +119,7 @@ TEST_P(SavedVariableLoadingTest, AssignAndReadVariableSuccesful) { Status status; std::unique_ptr var; TF_EXPECT_OK(Variable::CreateUninitialized(context(), dtype, shape, - absl::nullopt, &var)); + absl::nullopt, nullptr, &var)); // Create a TensorHandle ImmediateTensorHandlePtr expected_handle = diff --git a/tensorflow/c/experimental/saved_model/core/signature_def_function.h b/tensorflow/c/experimental/saved_model/core/signature_def_function.h new file mode 100644 index 00000000000..0a217f3cc21 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/signature_def_function.h @@ -0,0 +1,62 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_SIGNATURE_DEF_FUNCTION_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_SIGNATURE_DEF_FUNCTION_H_ + +#include +#include + +#include "absl/types/span.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.h" + +namespace tensorflow { + +// See tensorflow/cc/experimental/saved_model/public/signature_def_function.h +// for SignatureDefFunction's intended user-facing semantics. +// This class is the "implementation" C++ part of the C++/C/C++ sandwich for +// a SignatureDefFunction. 
+// Note(bmzhao): Implementation-wise, SignatureDefFunctions are always saved as +// a "BareConcreteFunction", w/o a FunctionSpec, rather than a SavedFunction: +// https://github.com/tensorflow/tensorflow/blob/9bcefa44cd335c1db4a703a13da09f29ae1bbdb2/tensorflow/core/protobuf/saved_object_graph.proto#L60 +// Additionally they are guaranteed to be children of the .signatures attribute +// of the root object, where the child object "name" is the signature_def key: +// https://github.com/tensorflow/tensorflow/blob/9bcefa44cd335c1db4a703a13da09f29ae1bbdb2/tensorflow/python/saved_model/signature_serialization.py#L181-L230 +// One of the critical requirements of SignatureDef functions is that their +// inputs and outputs are "named". For example, a `.signatures` function: +// a. Requires users to pass: kwargs of all inputs: +// https://github.com/tensorflow/tensorflow/blob/26c4ee0c833e74f94d0102d8b005c41a28b44445/tensorflow/python/saved_model/signature_serialization.py#L119-L126 +// b. Returns a dictionary of named outputs. +// https://github.com/tensorflow/tensorflow/blob/26c4ee0c833e74f94d0102d8b005c41a28b44445/tensorflow/python/saved_model/signature_serialization.py#L153-L161 +// Since SignatureDefFunctions do not have FunctionSpecs, but guarantee the +// dictionary of inputs/outputs, we can parse these dictionaries' keys to obtain +// the input/output names of the SignatureDef: +// https://github.com/tensorflow/tensorflow/blob/9bcefa44cd335c1db4a703a13da09f29ae1bbdb2/tensorflow/core/protobuf/meta_graph.proto#L318-L321 +class SignatureDefFunction { + public: + virtual ~SignatureDefFunction() = default; + + // Creates a "Call" Op used to execute the function. + virtual Status MakeCallOp(absl::Span inputs, + ImmediateOpPtr* out) const = 0; + + virtual const SignatureDefFunctionMetadata& GetFunctionMetadata() const = 0; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_SIGNATURE_DEF_FUNCTION_H_ diff --git a/tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.h b/tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.h new file mode 100644 index 00000000000..5a579676d4e --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.h @@ -0,0 +1,27 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_SIGNATURE_DEF_FUNCTION_METADATA_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_SIGNATURE_DEF_FUNCTION_METADATA_H_ + +namespace tensorflow { + +class SignatureDefFunctionMetadata { + // TODO(bmzhao): Fill in with fields as necessary +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_SIGNATURE_DEF_FUNCTION_METADATA_H_ diff --git a/tensorflow/c/experimental/saved_model/core/test_utils.cc b/tensorflow/c/experimental/saved_model/core/test_utils.cc index b803d129b90..d551919ea94 100644 --- a/tensorflow/c/experimental/saved_model/core/test_utils.cc +++ b/tensorflow/c/experimental/saved_model/core/test_utils.cc @@ -28,7 +28,7 @@ limitations under the License. #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/types.pb.h" -#include "tensorflow/core/lib/bfloat16/bfloat16.h" +#include "tensorflow/core/platform/bfloat16.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" diff --git a/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.cc b/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.cc index 0f0102be857..ab7052b52ed 100644 --- a/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.cc +++ b/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.cc @@ -34,6 +34,7 @@ limitations under the License. #include "tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h" #include "tensorflow/c/experimental/saved_model/core/revived_types/variable.h" #include "tensorflow/c/experimental/saved_model/core/saved_model_utils.h" +#include "tensorflow/c/experimental/saved_model/core/signature_def_function.h" #include "tensorflow/cc/saved_model/bundle_v2.h" #include "tensorflow/cc/saved_model/constants.h" #include "tensorflow/core/framework/attr_value.pb.h" @@ -305,7 +306,7 @@ Status TFSavedModelAPI::GetFunction(const std::string& function_path, } Status TFSavedModelAPI::GetSignatureDefFunction( - const std::string& signature_def_key, ConcreteFunction** function) { + const std::string& signature_def_key, SignatureDefFunction** function) { // TODO(bmzhao): Add support for retrieving a signaturedef function. return errors::Unimplemented( "Retrieving SignatureDef functions is unimplemented currently"); diff --git a/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.h b/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.h index fc8e738e86f..fd07c09474b 100644 --- a/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.h +++ b/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.h @@ -28,6 +28,7 @@ limitations under the License. 
#include "tensorflow/c/experimental/saved_model/core/revived_types/tensorhandle_convertible.h" #include "tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h" #include "tensorflow/c/experimental/saved_model/core/saved_model_api.h" +#include "tensorflow/c/experimental/saved_model/core/signature_def_function.h" #include "tensorflow/cc/saved_model/bundle_v2.h" #include "tensorflow/core/platform/status.h" @@ -55,7 +56,7 @@ class TFSavedModelAPI : public SavedModelAPI { ConcreteFunction** function) override; Status GetSignatureDefFunction(const std::string& signature_def_key, - ConcreteFunction** function) override; + SignatureDefFunction** function) override; static Status Load( const std::string& directory, diff --git a/tensorflow/c/experimental/saved_model/internal/BUILD b/tensorflow/c/experimental/saved_model/internal/BUILD index 323298c5fc1..c0d121a4aee 100644 --- a/tensorflow/c/experimental/saved_model/internal/BUILD +++ b/tensorflow/c/experimental/saved_model/internal/BUILD @@ -142,6 +142,8 @@ cc_library( ":concrete_function_list_type", ":concrete_function_type", ":saved_model_api_type", + ":signature_def_function", + ":signature_def_function_type", "//tensorflow/c:c_api_macros", "//tensorflow/c:tf_status", "//tensorflow/c:tf_status_internal", @@ -165,6 +167,77 @@ cc_library( ], ) +cc_library( + name = "signature_def_function", + srcs = [ + "signature_def_function.cc", + ], + hdrs = [ + "//tensorflow/c/experimental/saved_model/public:signature_def_function.h", + ], + copts = tf_copts(), + visibility = [ + "//tensorflow/c/experimental/saved_model/public:__pkg__", + ], + deps = [ + ":signature_def_function_metadata", + ":signature_def_function_metadata_type", + ":signature_def_function_type", + "//tensorflow/c:c_api_macros", + "//tensorflow/c:tf_status_internal", + "//tensorflow/c/eager:abstract_tensor_handle", + "//tensorflow/c/eager:c_api", + "//tensorflow/c/eager:immediate_execution_operation", + "//tensorflow/c/eager:tfe_op_internal", + "//tensorflow/c/eager:tfe_tensorhandle_internal", + "//tensorflow/c/experimental/saved_model/core:signature_def_function", + "//tensorflow/c/experimental/saved_model/core:signature_def_function_metadata", + "//tensorflow/core:lib", + "@com_google_absl//absl/types:span", + ], +) + +cc_library( + name = "signature_def_function_type", + hdrs = [ + "signature_def_function_type.h", + ], + deps = [ + "//tensorflow/c:conversion_macros", + "//tensorflow/c/experimental/saved_model/core:signature_def_function", + ], +) + +cc_library( + name = "signature_def_function_metadata", + srcs = [ + "signature_def_function_metadata.cc", + ], + hdrs = [ + "//tensorflow/c/experimental/saved_model/public:signature_def_function_metadata.h", + ], + copts = tf_copts(), + visibility = [ + "//tensorflow/c/experimental/saved_model/public:__pkg__", + ], + deps = [ + ":signature_def_function_metadata_type", + "//tensorflow/c:c_api_macros", + "//tensorflow/c/experimental/saved_model/core:signature_def_function_metadata", + ], +) + +cc_library( + name = "signature_def_function_metadata_type", + hdrs = [ + "signature_def_function_metadata_type.h", + ], + deps = [ + "//tensorflow/c:conversion_macros", + "//tensorflow/c/experimental/saved_model/core:signature_def_function_metadata", + ], +) + tf_cc_test( name = "saved_model_api_test", size = "small", diff --git a/tensorflow/c/experimental/saved_model/internal/saved_model_api.cc b/tensorflow/c/experimental/saved_model/internal/saved_model_api.cc index 983c98affb2..b89fb9f6d64 100644 --- 
a/tensorflow/c/experimental/saved_model/internal/saved_model_api.cc +++ b/tensorflow/c/experimental/saved_model/internal/saved_model_api.cc @@ -26,6 +26,7 @@ limitations under the License. #include "tensorflow/c/experimental/saved_model/internal/concrete_function_list_type.h" #include "tensorflow/c/experimental/saved_model/internal/concrete_function_type.h" #include "tensorflow/c/experimental/saved_model/internal/saved_model_api_type.h" +#include "tensorflow/c/experimental/saved_model/internal/signature_def_function_type.h" #include "tensorflow/c/tf_status.h" #include "tensorflow/c/tf_status_internal.h" #include "tensorflow/core/common_runtime/eager/context.h" @@ -106,9 +107,11 @@ TF_ConcreteFunction* TF_GetSavedModelConcreteFunction(TF_SavedModel* model, return tensorflow::wrap(result); } -TF_CAPI_EXPORT extern TF_ConcreteFunction* TF_GetSavedModelSignatureDefFunction( - TF_SavedModel* model, const char* signature_def_key, TF_Status* status) { - tensorflow::ConcreteFunction* result = nullptr; +TF_CAPI_EXPORT extern TF_SignatureDefFunction* +TF_GetSavedModelSignatureDefFunction(TF_SavedModel* model, + const char* signature_def_key, + TF_Status* status) { + tensorflow::SignatureDefFunction* result = nullptr; tensorflow::Status get_function_status = tensorflow::unwrap(model)->GetSignatureDefFunction(signature_def_key, &result); diff --git a/tensorflow/c/experimental/saved_model/internal/signature_def_function.cc b/tensorflow/c/experimental/saved_model/internal/signature_def_function.cc new file mode 100644 index 00000000000..64f7506f32e --- /dev/null +++ b/tensorflow/c/experimental/saved_model/internal/signature_def_function.cc @@ -0,0 +1,53 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/c/experimental/saved_model/public/signature_def_function.h" + +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/tfe_op_internal.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" +#include "tensorflow/c/experimental/saved_model/core/signature_def_function.h" +#include "tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.h" +#include "tensorflow/c/experimental/saved_model/internal/signature_def_function_metadata_type.h" +#include "tensorflow/c/experimental/saved_model/internal/signature_def_function_type.h" +#include "tensorflow/c/tf_status_internal.h" +#include "tensorflow/core/platform/status.h" + +extern "C" { + +TF_SignatureDefFunctionMetadata* TF_SignatureDefFunctionGetMetadata( + TF_SignatureDefFunction* func) { + return tensorflow::wrap(const_cast( + &tensorflow::unwrap(func)->GetFunctionMetadata())); +} + +TFE_Op* TF_SignatureDefFunctionMakeCallOp(TF_SignatureDefFunction* func, + TFE_TensorHandle** inputs, + int num_inputs, TF_Status* status) { + tensorflow::ImmediateOpPtr call_op; + absl::Span input_span( + reinterpret_cast( + tensorflow::unwrap(inputs)), + static_cast(num_inputs)); + status->status = tensorflow::unwrap(func)->MakeCallOp(input_span, &call_op); + if (!status->status.ok()) { + return nullptr; + } + return tensorflow::wrap(call_op.release()); +} + +} // end extern "C" diff --git a/tensorflow/c/experimental/saved_model/internal/signature_def_function_metadata.cc b/tensorflow/c/experimental/saved_model/internal/signature_def_function_metadata.cc new file mode 100644 index 00000000000..c5c3616211c --- /dev/null +++ b/tensorflow/c/experimental/saved_model/internal/signature_def_function_metadata.cc @@ -0,0 +1,20 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/experimental/saved_model/public/signature_def_function_metadata.h" + +#include "tensorflow/c/experimental/saved_model/internal/signature_def_function_metadata_type.h" + +// TODO(bmzhao): Add getter functions here as necessary. diff --git a/tensorflow/c/experimental/saved_model/internal/signature_def_function_metadata_type.h b/tensorflow/c/experimental/saved_model/internal/signature_def_function_metadata_type.h new file mode 100644 index 00000000000..fa6d0f6541e --- /dev/null +++ b/tensorflow/c/experimental/saved_model/internal/signature_def_function_metadata_type.h @@ -0,0 +1,31 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
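To show how the new C entry points above fit together, a hypothetical call sequence is sketched here. `model`, `inputs`, and `num_inputs` are assumed to already exist, the "serving_default" key is only an example, and per the TFSavedModelAPI change earlier in this patch GetSignatureDefFunction still returns Unimplemented, so this is the intended shape of the API rather than working code.

  TF_Status* status = TF_NewStatus();
  TF_SignatureDefFunction* serving_fn = TF_GetSavedModelSignatureDefFunction(
      model, "serving_default", status);
  if (TF_GetCode(status) == TF_OK) {
    // Metadata getters are still TODO per signature_def_function_metadata.cc.
    TF_SignatureDefFunctionMetadata* metadata =
        TF_SignatureDefFunctionGetMetadata(serving_fn);
    TFE_Op* call_op = TF_SignatureDefFunctionMakeCallOp(serving_fn, inputs,
                                                        num_inputs, status);
    // The returned TFE_Op would then be run via TFE_Execute and deleted with
    // TFE_DeleteOp by the caller.
  }
  TF_DeleteStatus(status);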
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SIGNATURE_DEF_FUNCTION_METADATA_TYPE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SIGNATURE_DEF_FUNCTION_METADATA_TYPE_H_ + +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.h" + +typedef struct TF_SignatureDefFunctionMetadata TF_SignatureDefFunctionMetadata; + +namespace tensorflow { + +DEFINE_CONVERSION_FUNCTIONS(tensorflow::SignatureDefFunctionMetadata, + TF_SignatureDefFunctionMetadata) + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SIGNATURE_DEF_FUNCTION_METADATA_TYPE_H_ diff --git a/tensorflow/c/experimental/saved_model/internal/signature_def_function_type.h b/tensorflow/c/experimental/saved_model/internal/signature_def_function_type.h new file mode 100644 index 00000000000..ca44dc43bd6 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/internal/signature_def_function_type.h @@ -0,0 +1,31 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SIGNATURE_DEF_FUNCTION_TYPE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SIGNATURE_DEF_FUNCTION_TYPE_H_ + +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/c/experimental/saved_model/core/signature_def_function.h" + +typedef struct TF_SignatureDefFunction TF_SignatureDefFunction; + +namespace tensorflow { + +DEFINE_CONVERSION_FUNCTIONS(tensorflow::SignatureDefFunction, + TF_SignatureDefFunction) + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SIGNATURE_DEF_FUNCTION_TYPE_H_ diff --git a/tensorflow/c/experimental/saved_model/public/BUILD b/tensorflow/c/experimental/saved_model/public/BUILD index af65e05e7f6..d29585ae1ba 100644 --- a/tensorflow/c/experimental/saved_model/public/BUILD +++ b/tensorflow/c/experimental/saved_model/public/BUILD @@ -24,6 +24,8 @@ exports_files( "concrete_function_list.h", "function_metadata.h", "saved_model_api.h", + "signature_def_function.h", + "signature_def_function_metadata.h", ], visibility = ["//tensorflow/c/experimental/saved_model/internal:__pkg__"], ) @@ -39,6 +41,8 @@ cc_library( ":concrete_function_list", ":function_metadata", ":saved_model_api", + ":signature_def_function", + ":signature_def_function_metadata", ], ) @@ -61,3 +65,13 @@ alias( name = "saved_model_api", actual = "//tensorflow/c/experimental/saved_model/internal:saved_model_api", ) + +alias( + name = "signature_def_function", + actual = "//tensorflow/c/experimental/saved_model/internal:signature_def_function", +) + +alias( + name = "signature_def_function_metadata", + actual = "//tensorflow/c/experimental/saved_model/internal:signature_def_function_metadata", +) diff --git a/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h b/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h index 30f533f140a..cedb9de66b8 100644 --- a/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h +++ b/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h @@ -21,6 +21,8 @@ limitations under the License. #include "tensorflow/c/experimental/saved_model/public/concrete_function_list.h" #include "tensorflow/c/experimental/saved_model/public/function_metadata.h" #include "tensorflow/c/experimental/saved_model/public/saved_model_api.h" +#include "tensorflow/c/experimental/saved_model/public/signature_def_function.h" +#include "tensorflow/c/experimental/saved_model/public/signature_def_function_metadata.h" // IWYU pragma: end_exports #endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_C_SAVED_MODEL_API_H_ diff --git a/tensorflow/c/experimental/saved_model/public/concrete_function.h b/tensorflow/c/experimental/saved_model/public/concrete_function.h index ee5292294d6..0fd0f70cf16 100644 --- a/tensorflow/c/experimental/saved_model/public/concrete_function.h +++ b/tensorflow/c/experimental/saved_model/public/concrete_function.h @@ -40,6 +40,13 @@ TF_CAPI_EXPORT extern TF_FunctionMetadata* TF_ConcreteFunctionGetMetadata( // The caller is responsible for deleting the returned TFE_Op. If op // construction fails, `status` will be non-OK and the returned pointer will be // null. +// TODO(bmzhao): Remove this function in a subsequent change; Design + implement +// a Function Execution interface for ConcreteFunction that accepts a tagged +// union of types (tensorflow::Value). 
This effectively requires moving much of +// the implementation of function.py/def_function.py to C++, and exposing a +// high-level API here. A strawman for what this interface could look like: +// TF_Value* TF_ExecuteFunction(TFE_Context*, TF_ConcreteFunction*, TF_Value* +// inputs, int num_inputs, TF_Status* status); TF_CAPI_EXPORT extern TFE_Op* TF_ConcreteFunctionGetCallOp( TF_ConcreteFunction* func, TFE_TensorHandle** inputs, int num_inputs, TF_Status* status); diff --git a/tensorflow/c/experimental/saved_model/public/saved_model_api.h b/tensorflow/c/experimental/saved_model/public/saved_model_api.h index 875167bec63..80ba37bab26 100644 --- a/tensorflow/c/experimental/saved_model/public/saved_model_api.h +++ b/tensorflow/c/experimental/saved_model/public/saved_model_api.h @@ -19,6 +19,7 @@ limitations under the License. #include "tensorflow/c/c_api_macros.h" #include "tensorflow/c/experimental/saved_model/public/concrete_function.h" #include "tensorflow/c/experimental/saved_model/public/concrete_function_list.h" +#include "tensorflow/c/experimental/saved_model/public/signature_def_function.h" #include "tensorflow/c/tf_status.h" #ifdef __cplusplus @@ -91,10 +92,13 @@ TF_CAPI_EXPORT extern TF_ConcreteFunction* TF_GetSavedModelConcreteFunction( // status - Set to OK on success and an appropriate error on failure. // Returns: // If status is not OK, returns nullptr. Otherwise, returns a -// TF_ConcreteFunction instance. Once `model` is deleted, all -// `TF_ConcreteFunctions` retrieved from it are invalid, and have been deleted. -TF_CAPI_EXPORT extern TF_ConcreteFunction* TF_GetSavedModelSignatureDefFunction( - TF_SavedModel* model, const char* signature_def_key, TF_Status* status); +// TF_SignatureDefFunction instance. Once `model` is deleted, all +// `TF_SignatureDefFunctions` retrieved from it are invalid, and have been +// deleted. +TF_CAPI_EXPORT extern TF_SignatureDefFunction* +TF_GetSavedModelSignatureDefFunction(TF_SavedModel* model, + const char* signature_def_key, + TF_Status* status); // Returns a list of all ConcreteFunctions stored in this SavedModel. // The lifetime of the returned list is bound to `model`. diff --git a/tensorflow/c/experimental/saved_model/public/signature_def_function.h b/tensorflow/c/experimental/saved_model/public/signature_def_function.h new file mode 100644 index 00000000000..16471fdc1fa --- /dev/null +++ b/tensorflow/c/experimental/saved_model/public/signature_def_function.h @@ -0,0 +1,50 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SIGNATURE_DEF_FUNCTION_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SIGNATURE_DEF_FUNCTION_H_ + +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/experimental/saved_model/public/signature_def_function_metadata.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// An opaque type that corresponds to a SignatureDefFunction loaded from a +// SavedModel. +typedef struct TF_SignatureDefFunction TF_SignatureDefFunction; + +// Returns FunctionMetadata associated with `func`. Metadata's lifetime is +// bound to `func`, which is bound to the TF_SavedModel it was loaded from. +TF_CAPI_EXPORT extern TF_SignatureDefFunctionMetadata* +TF_SignatureDefFunctionGetMetadata(TF_SignatureDefFunction* func); + +// Returns a TFE_Op suitable for executing this function. Caller must provide +// all function inputs in `inputs`, and must not add any additional inputs on +// the returned op. (i.e. don't call TFE_OpAddInput or TFE_OpAddInputList). +// The caller is responsible for deleting the returned TFE_Op. If op +// construction fails, `status` will be non-OK and the returned pointer will be +// null. +TF_CAPI_EXPORT extern TFE_Op* TF_SignatureDefFunctionMakeCallOp( + TF_SignatureDefFunction* func, TFE_TensorHandle** inputs, int num_inputs, + TF_Status* status); + +#ifdef __cplusplus +} // end extern "C" +#endif // __cplusplus + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SIGNATURE_DEF_FUNCTION_H_ diff --git a/tensorflow/c/experimental/saved_model/public/signature_def_function_metadata.h b/tensorflow/c/experimental/saved_model/public/signature_def_function_metadata.h new file mode 100644 index 00000000000..6f4459732c4 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/public/signature_def_function_metadata.h @@ -0,0 +1,31 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SIGNATURE_DEF_FUNCTION_METADATA_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SIGNATURE_DEF_FUNCTION_METADATA_H_ + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// An opaque type that corresponds to a SignatureDefFunction loaded from a +// SavedModel. +typedef struct TF_SignatureDefFunctionMetadata TF_SignatureDefFunctionMetadata; + +#ifdef __cplusplus +} // end extern "C" +#endif // __cplusplus + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SIGNATURE_DEF_FUNCTION_METADATA_H_ diff --git a/tensorflow/c/experimental/stream_executor/BUILD b/tensorflow/c/experimental/stream_executor/BUILD new file mode 100644 index 00000000000..7daa311d461 --- /dev/null +++ b/tensorflow/c/experimental/stream_executor/BUILD @@ -0,0 +1,60 @@ +# Description: +# StreamExecutor C API. 
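Before the StreamExecutor C API files below, a quick aside on the SignatureDef surface added above: taken together, these public headers give an external caller an end-to-end path from a SavedModel on disk to an executable call op. The following minimal sketch (not part of the diff) shows that flow; the model directory and signature key are hypothetical, error checking is elided, and it assumes the TF_LoadSavedModel / TF_DeleteSavedModel entry points already declared in saved_model_api.h.

  #include "tensorflow/c/eager/c_api.h"
  #include "tensorflow/c/experimental/saved_model/public/c_saved_model_api.h"

  void CallSignature() {
    TF_Status* status = TF_NewStatus();
    TFE_ContextOptions* opts = TFE_NewContextOptions();
    TFE_Context* ctx = TFE_NewContext(opts, status);
    TFE_DeleteContextOptions(opts);

    // "/tmp/my_model" and "serving_default" are placeholder values.
    TF_SavedModel* model = TF_LoadSavedModel("/tmp/my_model", ctx, status);
    TF_SignatureDefFunction* fn =
        TF_GetSavedModelSignatureDefFunction(model, "serving_default", status);

    TFE_TensorHandle* inputs[1] = {nullptr};  // caller-provided input handle(s)
    TFE_Op* call_op = TF_SignatureDefFunctionMakeCallOp(fn, inputs, 1, status);

    TFE_TensorHandle* outputs[1];
    int num_outputs = 1;  // must match the signature's output count
    TFE_Execute(call_op, outputs, &num_outputs, status);

    TFE_DeleteOp(call_op);
    TF_DeleteSavedModel(model);
    TFE_DeleteContext(ctx);
    TF_DeleteStatus(status);
  }

Note that the TF_SignatureDefFunction itself is owned by the TF_SavedModel (per the saved_model_api.h comment above), so only the call op and the model are deleted here.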
+ +load( + "//tensorflow:tensorflow.bzl", + "tf_cc_test", +) + +package( + licenses = ["notice"], # Apache 2.0 +) + +cc_library( + name = "stream_executor", + srcs = ["stream_executor.cc"], + hdrs = ["stream_executor.h"], + visibility = ["//visibility:public"], + deps = [ + ":stream_executor_internal", + "//tensorflow/c:c_api_macros", + "//tensorflow/c:tf_status", + "//tensorflow/c:tf_status_helper", + "//tensorflow/core:lib", + "//tensorflow/stream_executor:executor_cache", + "//tensorflow/stream_executor:multi_platform_manager", + "//tensorflow/stream_executor:platform", + "//tensorflow/stream_executor:stream_executor_internal", + "//tensorflow/stream_executor:stream_executor_pimpl", + "//tensorflow/stream_executor:timer", + ], +) + +cc_library( + name = "stream_executor_internal", + hdrs = [ + "stream_executor.h", + "stream_executor_internal.h", + ], + deps = [ + "//tensorflow/c:c_api_macros", + "//tensorflow/c:tf_status", + "//tensorflow/stream_executor:executor_cache", + "//tensorflow/stream_executor/lib", + ], +) + +tf_cc_test( + name = "stream_executor_test", + srcs = ["stream_executor_test.cc"], + deps = [ + ":stream_executor", + ":stream_executor_internal", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/protobuf:error_codes_proto_impl_cc", + "//tensorflow/stream_executor:multi_platform_manager", + "//tensorflow/stream_executor:stream", + "//tensorflow/stream_executor:stream_executor_pimpl", + ], +) diff --git a/tensorflow/c/experimental/stream_executor/stream_executor.cc b/tensorflow/c/experimental/stream_executor/stream_executor.cc new file mode 100644 index 00000000000..0e55ba3d72a --- /dev/null +++ b/tensorflow/c/experimental/stream_executor/stream_executor.cc @@ -0,0 +1,809 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// This file extends/implements core stream executor base classes in terms of +// the C API defined in stream_executor.h. A class "CSomething" represents a +// "Something" that can be manipulated via calls in the C interface and a C +// struct called "SP_Something". +// +// This file also contains stream_executor::Platform registration for pluggable +// device. 
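+//
+// Intended usage from the TensorFlow side is roughly the following (the
+// plugin path and `test_plugin_init_fn` are hypothetical, and the exact call
+// site that loads plugins is still to be decided); see RegisterDevicePlugin
+// below:
+//
+//   #include "tensorflow/c/experimental/stream_executor/stream_executor_internal.h"
+//
+//   // Load a plugin .so and register its platform:
+//   TF_CHECK_OK(stream_executor::RegisterDevicePlugin(
+//       "/path/to/libmy_device_plugin.so"));
+//
+//   // Or, in tests, register directly from an SEPluginInitFn:
+//   TF_CHECK_OK(stream_executor::RegisterDevicePlugin(test_plugin_init_fn));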
+#include "tensorflow/c/experimental/stream_executor/stream_executor.h" + +#include + +#include "tensorflow/c/experimental/stream_executor/stream_executor_internal.h" +#include "tensorflow/c/tf_status_helper.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/stream_executor/executor_cache.h" +#include "tensorflow/stream_executor/multi_platform_manager.h" +#include "tensorflow/stream_executor/platform.h" +#include "tensorflow/stream_executor/stream_executor_internal.h" +#include "tensorflow/stream_executor/stream_executor_pimpl.h" +#include "tensorflow/stream_executor/timer.h" + +using tensorflow::StatusFromTF_Status; + +namespace stream_executor { +namespace { + +#define VALIDATE_STRUCT_SIZE(STRUCT_NAME, STRUCT_OBJ, SIZE_VALUE_NAME) \ + do { \ + if (STRUCT_OBJ.struct_size == 0) { \ + return port::FailedPreconditionError( \ + "struct_size field in " #STRUCT_NAME \ + " must be set to " #SIZE_VALUE_NAME "."); \ + } \ + } while (0) + +#define VALIDATE_MEMBER(STRUCT_NAME, STRUCT_OBJ, NAME) \ + do { \ + if (STRUCT_OBJ.NAME == 0) { \ + return port::FailedPreconditionError( \ + "'" #NAME "' field in " #STRUCT_NAME " must be set."); \ + } \ + } while (0) + +port::Status ValidateSPPlatform(const SP_Platform& platform) { + VALIDATE_STRUCT_SIZE(SP_Platform, platform, SP_PLATFORM_STRUCT_SIZE); + VALIDATE_MEMBER(SP_Platform, platform, name); + VALIDATE_MEMBER(SP_Platform, platform, type); + VALIDATE_MEMBER(SP_Platform, platform, visible_device_count); + VALIDATE_MEMBER(SP_Platform, platform, create_device); + VALIDATE_MEMBER(SP_Platform, platform, destroy_device); + VALIDATE_MEMBER(SP_Platform, platform, create_stream_executor); + VALIDATE_MEMBER(SP_Platform, platform, destroy_stream_executor); + VALIDATE_MEMBER(SP_Platform, platform, create_timer_fns); + VALIDATE_MEMBER(SP_Platform, platform, destroy_timer_fns); + return port::Status::OK(); +} + +port::Status ValidateSPTimerFns(const SP_TimerFns& timer_fns) { + VALIDATE_STRUCT_SIZE(SP_TimerFns, timer_fns, SP_TIMER_FNS_STRUCT_SIZE); + VALIDATE_MEMBER(SP_TimerFns, timer_fns, nanoseconds); + return port::Status::OK(); +} + +port::Status ValidateSPAllocatorStats(const SP_AllocatorStats& stats) { + VALIDATE_STRUCT_SIZE(SP_AllocatorStats, stats, SP_ALLOCATORSTATS_STRUCT_SIZE); + // All other fields could theoretically be zero/null. + return port::Status::OK(); +} + +port::Status ValidateSPDeviceMemoryBase(const SP_DeviceMemoryBase& mem) { + VALIDATE_STRUCT_SIZE(SP_DeviceMemoryBase, mem, + SP_DEVICE_MEMORY_BASE_STRUCT_SIZE); + // All other fields could theoretically be zero/null. + return port::Status::OK(); +} + +port::Status ValidateSPDevice(const SP_Device& device) { + VALIDATE_STRUCT_SIZE(SP_Device, device, SP_DEVICE_STRUCT_SIZE); + // All other fields could theoretically be zero/null. 
+ return port::Status::OK(); +} + +port::Status ValidateSPStreamExecutor(const SP_StreamExecutor& se) { + VALIDATE_STRUCT_SIZE(SP_StreamExecutor, se, SP_STREAM_EXECUTOR_STRUCT_SIZE); + VALIDATE_MEMBER(SP_StreamExecutor, se, allocate); + VALIDATE_MEMBER(SP_StreamExecutor, se, deallocate); + VALIDATE_MEMBER(SP_StreamExecutor, se, get_allocator_stats); + VALIDATE_MEMBER(SP_StreamExecutor, se, device_memory_usage); + VALIDATE_MEMBER(SP_StreamExecutor, se, create_stream); + VALIDATE_MEMBER(SP_StreamExecutor, se, destroy_stream); + VALIDATE_MEMBER(SP_StreamExecutor, se, create_stream_dependency); + VALIDATE_MEMBER(SP_StreamExecutor, se, get_stream_status); + VALIDATE_MEMBER(SP_StreamExecutor, se, create_event); + VALIDATE_MEMBER(SP_StreamExecutor, se, destroy_event); + VALIDATE_MEMBER(SP_StreamExecutor, se, get_event_status); + VALIDATE_MEMBER(SP_StreamExecutor, se, record_event); + VALIDATE_MEMBER(SP_StreamExecutor, se, wait_for_event); + VALIDATE_MEMBER(SP_StreamExecutor, se, create_timer); + VALIDATE_MEMBER(SP_StreamExecutor, se, destroy_timer); + VALIDATE_MEMBER(SP_StreamExecutor, se, start_timer); + VALIDATE_MEMBER(SP_StreamExecutor, se, stop_timer); + VALIDATE_MEMBER(SP_StreamExecutor, se, memcpy_dtoh); + VALIDATE_MEMBER(SP_StreamExecutor, se, memcpy_htod); + VALIDATE_MEMBER(SP_StreamExecutor, se, sync_memcpy_dtoh); + VALIDATE_MEMBER(SP_StreamExecutor, se, sync_memcpy_htod); + VALIDATE_MEMBER(SP_StreamExecutor, se, block_host_for_event); + VALIDATE_MEMBER(SP_StreamExecutor, se, synchronize_all_activity); + VALIDATE_MEMBER(SP_StreamExecutor, se, host_callback); + return port::Status::OK(); +} + +port::Status ValidateSEPlatformRegistrationParams( + const SE_PlatformRegistrationParams& params) { + VALIDATE_STRUCT_SIZE(SE_PlatformRegistrationParams, params, + SE_PLATFORM_REGISTRATION_PARAMS_STRUCT_SIZE); + VALIDATE_MEMBER(SE_PlatformRegistrationParams, params, destroy_platform); + return port::Status::OK(); +} + +#undef VALIDATE_MEMBER + +struct TFStatusDeleter { + void operator()(TF_Status* s) const { TF_DeleteStatus(s); } +}; +using OwnedTFStatus = std::unique_ptr; + +class CStream : public internal::StreamInterface { + public: + CStream(SP_Device* device, SP_StreamExecutor* stream_executor) + : device_(device), + stream_executor_(stream_executor), + stream_handle_(nullptr) {} + ~CStream() override { Destroy(); } + + port::Status Create() { + OwnedTFStatus c_status(TF_NewStatus()); + stream_executor_->create_stream(device_, &stream_handle_, c_status.get()); + port::Status s = StatusFromTF_Status(c_status.get()); + return s; + } + + void Destroy() { + if (stream_handle_ != nullptr) { + stream_executor_->destroy_stream(device_, stream_handle_); + stream_handle_ = nullptr; + } + } + + SP_Stream Handle() { return stream_handle_; } + + private: + SP_Device* device_; + SP_StreamExecutor* stream_executor_; + SP_Stream stream_handle_; +}; + +// Converts SE_EventStatus to Event::Status. 
+Event::Status SEEventStatusToEventStatus(SE_EventStatus s) { + switch (s) { + case SE_EVENT_ERROR: + return Event::Status::kError; + case SE_EVENT_PENDING: + return Event::Status::kPending; + case SE_EVENT_COMPLETE: + return Event::Status::kComplete; + default: + return Event::Status::kUnknown; + } +} + +class CEvent : public internal::EventInterface { + public: + CEvent(SP_Device* device, SP_StreamExecutor* stream_executor) + : device_(device), + stream_executor_(stream_executor), + event_handle_(nullptr) {} + ~CEvent() override { Destroy(); } + + port::Status Create() { + OwnedTFStatus c_status(TF_NewStatus()); + stream_executor_->create_event(device_, &event_handle_, c_status.get()); + return StatusFromTF_Status(c_status.get()); + } + + port::Status Record(SP_Stream stream_handle) { + OwnedTFStatus c_status(TF_NewStatus()); + stream_executor_->record_event(device_, stream_handle, event_handle_, + c_status.get()); + return StatusFromTF_Status(c_status.get()); + } + + void Destroy() { + if (event_handle_ != nullptr) { + stream_executor_->destroy_event(device_, event_handle_); + event_handle_ = nullptr; + } + } + + SP_Event Handle() { return event_handle_; } + + private: + SP_Device* device_; + SP_StreamExecutor* stream_executor_; + SP_Event event_handle_; +}; + +class CTimer : public internal::TimerInterface { + public: + CTimer(SP_Device* device, SP_StreamExecutor* stream_executor, + SP_TimerFns* timer_fns) + : device_(device), + stream_executor_(stream_executor), + timer_handle_(nullptr), + timer_fns_(timer_fns) {} + ~CTimer() override { Destroy(); } + + port::Status Create() { + OwnedTFStatus c_status(TF_NewStatus()); + stream_executor_->create_timer(device_, &timer_handle_, c_status.get()); + return StatusFromTF_Status(c_status.get()); + } + + void Destroy() { + if (timer_handle_ != nullptr) { + stream_executor_->destroy_timer(device_, timer_handle_); + timer_handle_ = nullptr; + } + } + + SP_Timer Handle() { return timer_handle_; } + + uint64 Microseconds() const override { + return timer_fns_->nanoseconds(timer_handle_) / 1000; + } + + uint64 Nanoseconds() const override { + return timer_fns_->nanoseconds(timer_handle_); + } + + private: + SP_Device* device_; + SP_StreamExecutor* stream_executor_; + SP_Timer timer_handle_; + SP_TimerFns* timer_fns_; +}; + +// Converts DeviceMemoryBase to a C struct. +SP_DeviceMemoryBase DeviceMemoryBaseToC(const DeviceMemoryBase* mem) { + SP_DeviceMemoryBase device_memory_base{SP_DEVICE_MEMORY_BASE_STRUCT_SIZE}; + // `opaque` field inside SP_DeviceMemoryBase is not const. + // Therefore, we need to cast away the constness before setting it. + device_memory_base.opaque = const_cast(mem->opaque()); + device_memory_base.size = mem->size(); + device_memory_base.payload = mem->payload(); + // TODO(annarev): Add `ext` field to DeviceMemoryBase and set it here. + return device_memory_base; +} + +DeviceMemoryBase DeviceMemoryBaseFromC(const SP_DeviceMemoryBase& mem) { + DeviceMemoryBase base(mem.opaque, mem.size); + base.SetPayload(mem.payload); + // TODO(annarev): Add `ext` field to DeviceMemoryBase and set it here. + return base; +} + +// Wrapper that allows passing std::function across C API. +struct HostCallbackContext { + std::function callback; +}; + +// This wrapper allows calling `HostCallbackContext::callback` across C API. +// This function matches `SE_StatusCallbackFn` signature and will be passed as +// `callback_fn` to `host_callback` in `SP_StreamExecutor`. 
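+// CStreamExecutor::HostCallback() below allocates a HostCallbackContext on the
+// heap and passes this trampoline plus the context pointer to the plugin's
+// `host_callback`; when the plugin eventually invokes the trampoline, it runs
+// the wrapped callback, converts the resulting Status into the given
+// TF_Status, and deletes the context.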
+void HostCallbackTrampoline(void* ctx, TF_Status* status) { + HostCallbackContext* host_ctx = static_cast(ctx); + port::Status s = host_ctx->callback(); + Set_TF_Status_from_Status(status, s); + delete host_ctx; +} + +class CStreamExecutor : public internal::StreamExecutorInterface { + public: + explicit CStreamExecutor(SP_Device device, + void (*destroy_device)(SP_Device* const device), + SP_StreamExecutor* stream_executor, + SP_TimerFns* timer_fns, const std::string& name, + int visible_device_count) + : device_(std::move(device)), + destroy_device_(destroy_device), + stream_executor_(stream_executor), + timer_fns_(timer_fns), + platform_name_(name), + visible_device_count_(visible_device_count) {} + + ~CStreamExecutor() override { destroy_device_(&device_); } + + port::Status Init(int device_ordinal, DeviceOptions device_options) override { + return port::Status::OK(); + } + + DeviceMemoryBase Allocate(uint64 size, int64 memory_space) override { + SP_DeviceMemoryBase mem = {SP_DEVICE_MEMORY_BASE_STRUCT_SIZE}; + stream_executor_->allocate(&device_, size, memory_space, &mem); + port::Status status = ValidateSPDeviceMemoryBase(mem); + if (!status.ok()) { + LOG(ERROR) << status.error_message(); + } + return DeviceMemoryBaseFromC(mem); + } + DeviceMemoryBase Allocate(uint64 size) { + return Allocate(size, /*memory_space=*/0); + } + void* GetSubBuffer(DeviceMemoryBase* parent, uint64 offset, + uint64 size) override { + LOG(FATAL) << "GetSubBuffer is not supported by pluggable device."; + } + + void Deallocate(DeviceMemoryBase* mem) override { + SP_DeviceMemoryBase device_memory_base = DeviceMemoryBaseToC(mem); + stream_executor_->deallocate(&device_, &device_memory_base); + } + + void* HostMemoryAllocate(uint64 size) override { + return stream_executor_->host_memory_allocate(&device_, size); + } + + void HostMemoryDeallocate(void* mem) override { + stream_executor_->host_memory_deallocate(&device_, mem); + } + + bool HostMemoryRegister(void* mem, uint64 size) override { return false; } + bool HostMemoryUnregister(void* mem) override { return false; } + + absl::optional GetAllocatorStats() override { + SP_AllocatorStats c_stats{SP_ALLOCATORSTATS_STRUCT_SIZE}; + TF_Bool has_stats = + stream_executor_->get_allocator_stats(&device_, &c_stats); + if (!has_stats) { + return absl::nullopt; + } + port::Status status = ValidateSPAllocatorStats(c_stats); + if (!status.ok()) { + LOG(ERROR) << status.error_message(); + return absl::nullopt; + } + // TODO(annarev): validate SP_AllocatorStats. 
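+    // The conversion below copies the plugin-provided SP_AllocatorStats
+    // fields into stream_executor::AllocatorStats; the optional bytes_limit
+    // and bytes_reservable_limit fields are only set when the corresponding
+    // has_* flag is true.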
+ ::stream_executor::AllocatorStats stats; + stats.num_allocs = c_stats.num_allocs; + stats.bytes_in_use = c_stats.bytes_in_use; + stats.peak_bytes_in_use = c_stats.peak_bytes_in_use; + stats.largest_alloc_size = c_stats.largest_alloc_size; + if (c_stats.has_bytes_limit) { + stats.bytes_limit = c_stats.bytes_limit; + } + stats.bytes_reserved = c_stats.bytes_reserved; + stats.peak_bytes_reserved = c_stats.peak_bytes_reserved; + if (c_stats.has_bytes_reservable_limit) { + stats.bytes_reservable_limit = c_stats.bytes_reservable_limit; + } + stats.largest_free_block_bytes = c_stats.largest_free_block_bytes; + return stats; + } + bool SynchronizeAllActivity() override { + OwnedTFStatus c_status(TF_NewStatus()); + stream_executor_->synchronize_all_activity(&device_, c_status.get()); + if (TF_GetCode(c_status.get()) != TF_OK) { + LOG(ERROR) << TF_Message(c_status.get()); + return false; + } + return true; + } + port::Status SynchronousMemZero(DeviceMemoryBase* location, + uint64 size) override { + // TODO(annarev): figure out if we should support memzero/memset + // functionality by allocating on host and then copying to device. + return port::UnimplementedError( + "SynchronousMemZero is not supported by pluggable device."); + } + port::Status SynchronousMemSet(DeviceMemoryBase* location, int value, + uint64 size) override { + return port::UnimplementedError( + "SynchronousMemSet is not supported by pluggable device."); + } + port::Status SynchronousMemcpy(DeviceMemoryBase* gpu_dst, + const void* host_src, uint64 size) override { + OwnedTFStatus c_status(TF_NewStatus()); + SP_DeviceMemoryBase device_memory_base = DeviceMemoryBaseToC(gpu_dst); + stream_executor_->sync_memcpy_htod(&device_, &device_memory_base, host_src, + size, c_status.get()); + return StatusFromTF_Status(c_status.get()); + } + port::Status SynchronousMemcpy(void* host_dst, + const DeviceMemoryBase& gpu_src, + uint64 size) override { + OwnedTFStatus c_status(TF_NewStatus()); + SP_DeviceMemoryBase device_memory_base = DeviceMemoryBaseToC(&gpu_src); + stream_executor_->sync_memcpy_dtoh(&device_, host_dst, &device_memory_base, + size, c_status.get()); + return StatusFromTF_Status(c_status.get()); + } + port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase* gpu_dst, + const DeviceMemoryBase& gpu_src, + uint64 size) override { + OwnedTFStatus c_status(TF_NewStatus()); + SP_DeviceMemoryBase device_mem_dst = DeviceMemoryBaseToC(gpu_dst); + SP_DeviceMemoryBase device_mem_src = DeviceMemoryBaseToC(&gpu_src); + stream_executor_->sync_memcpy_dtod(&device_, &device_mem_dst, + &device_mem_src, size, c_status.get()); + return StatusFromTF_Status(c_status.get()); + } + port::Status MemZero(Stream* stream, DeviceMemoryBase* location, + uint64 size) override { + return port::UnimplementedError( + "MemZero is not supported by pluggable device."); + } + port::Status Memset(Stream* stream, DeviceMemoryBase* location, uint8 pattern, + uint64 size) override { + return port::UnimplementedError( + "Memset is not supported by pluggable device."); + } + port::Status Memset32(Stream* stream, DeviceMemoryBase* location, + uint32 pattern, uint64 size) override { + return port::UnimplementedError( + "Memset32 is not supported by pluggable device."); + } + bool Memcpy(Stream* stream, void* host_dst, const DeviceMemoryBase& gpu_src, + uint64 size) override { + OwnedTFStatus c_status(TF_NewStatus()); + SP_Stream stream_handle = + static_cast(stream->implementation())->Handle(); + SP_DeviceMemoryBase device_mem_src = DeviceMemoryBaseToC(&gpu_src); + 
stream_executor_->memcpy_dtoh(&device_, stream_handle, host_dst, + &device_mem_src, size, c_status.get()); + if (TF_GetCode(c_status.get()) != TF_OK) { + LOG(ERROR) << TF_Message(c_status.get()); + return false; + } + return true; + } + bool Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst, const void* host_src, + uint64 size) override { + OwnedTFStatus c_status(TF_NewStatus()); + SP_Stream stream_handle = + static_cast(stream->implementation())->Handle(); + SP_DeviceMemoryBase device_mem_dst = DeviceMemoryBaseToC(gpu_dst); + stream_executor_->memcpy_htod(&device_, stream_handle, &device_mem_dst, + host_src, size, c_status.get()); + if (TF_GetCode(c_status.get()) != TF_OK) { + LOG(ERROR) << TF_Message(c_status.get()); + return false; + } + return true; + } + bool MemcpyDeviceToDevice(Stream* stream, DeviceMemoryBase* gpu_dst, + const DeviceMemoryBase& gpu_src, + uint64 size) override { + OwnedTFStatus c_status(TF_NewStatus()); + SP_Stream stream_handle = + static_cast(stream->implementation())->Handle(); + SP_DeviceMemoryBase device_mem_dst = DeviceMemoryBaseToC(gpu_dst); + SP_DeviceMemoryBase device_mem_src = DeviceMemoryBaseToC(&gpu_src); + stream_executor_->memcpy_dtod(&device_, stream_handle, &device_mem_dst, + &device_mem_src, size, c_status.get()); + if (TF_GetCode(c_status.get()) != TF_OK) { + LOG(ERROR) << TF_Message(c_status.get()); + return false; + } + return true; + } + bool HostCallback(Stream* stream, + std::function callback) override { + SP_Stream stream_handle = + static_cast(stream->implementation())->Handle(); + HostCallbackContext* ctx = new HostCallbackContext{callback}; + return stream_executor_->host_callback(&device_, stream_handle, + &HostCallbackTrampoline, ctx); + } + port::Status AllocateEvent(Event* event) override { + DCHECK(event != nullptr); + return static_cast(event->implementation())->Create(); + } + port::Status DeallocateEvent(Event* event) override { + static_cast(event->implementation())->Destroy(); + return port::Status::OK(); + } + port::Status RecordEvent(Stream* stream, Event* event) override { + SP_Stream stream_handle = + static_cast(stream->implementation())->Handle(); + return static_cast(event->implementation())->Record(stream_handle); + } + port::Status WaitForEvent(Stream* stream, Event* event) override { + SP_Stream stream_handle = + static_cast(stream->implementation())->Handle(); + SP_Event event_handle = + static_cast(event->implementation())->Handle(); + OwnedTFStatus c_status(TF_NewStatus()); + stream_executor_->wait_for_event(&device_, stream_handle, event_handle, + c_status.get()); + port::Status s = StatusFromTF_Status(c_status.get()); + return s; + } + Event::Status PollForEventStatus(Event* event) override { + SP_Event event_handle = + static_cast(event->implementation())->Handle(); + SE_EventStatus event_status = + stream_executor_->get_event_status(&device_, event_handle); + return SEEventStatusToEventStatus(event_status); + } + bool AllocateStream(Stream* stream) override { + DCHECK(stream != nullptr); + port::Status status = + static_cast(stream->implementation())->Create(); + // TODO(annarev): update AllocateStream to return status instead + // (similar to AllocateEvent). 
+ return status.ok(); + } + void DeallocateStream(Stream* stream) override { + static_cast(stream->implementation())->Destroy(); + } + bool CreateStreamDependency(Stream* dependent, Stream* other) override { + OwnedTFStatus c_status(TF_NewStatus()); + SP_Stream dependent_handle = + static_cast(dependent->implementation())->Handle(); + SP_Stream other_handle = + static_cast(other->implementation())->Handle(); + stream_executor_->create_stream_dependency(&device_, dependent_handle, + other_handle, c_status.get()); + if (TF_GetCode(c_status.get()) != TF_OK) { + LOG(ERROR) << TF_Message(c_status.get()); + return false; + } + return true; + } + bool AllocateTimer(Timer* timer) override { + port::Status status = + static_cast(timer->implementation())->Create(); + // TODO(annarev): change return value of AllocateTimer + // to status (similar to AllocateEvent). + return status.ok(); + } + void DeallocateTimer(Timer* timer) override { + static_cast(timer->implementation())->Destroy(); + } + bool StartTimer(Stream* stream, Timer* timer) override { + OwnedTFStatus c_status(TF_NewStatus()); + SP_Stream stream_handle = + static_cast(stream->implementation())->Handle(); + SP_Timer timer_handle = + static_cast(timer->implementation())->Handle(); + stream_executor_->start_timer(&device_, stream_handle, timer_handle, + c_status.get()); + if (TF_GetCode(c_status.get()) != TF_OK) { + LOG(ERROR) << TF_Message(c_status.get()); + return false; + } + return true; + } + bool StopTimer(Stream* stream, Timer* timer) override { + OwnedTFStatus c_status(TF_NewStatus()); + SP_Stream stream_handle = + static_cast(stream->implementation())->Handle(); + SP_Timer timer_handle = + static_cast(timer->implementation())->Handle(); + stream_executor_->stop_timer(&device_, stream_handle, timer_handle, + c_status.get()); + if (TF_GetCode(c_status.get()) != TF_OK) { + LOG(ERROR) << TF_Message(c_status.get()); + return false; + } + return true; + } + port::Status BlockHostForEvent(Stream* stream, Event* event) { + OwnedTFStatus c_status(TF_NewStatus()); + SP_Event event_handle = + static_cast(event->implementation())->Handle(); + stream_executor_->block_host_for_event(&device_, event_handle, + c_status.get()); + return StatusFromTF_Status(c_status.get()); + } + + port::Status BlockHostUntilDone(Stream* stream) override { + OwnedTFStatus c_status(TF_NewStatus()); + SP_Event event_handle; + stream_executor_->create_event(&device_, &event_handle, c_status.get()); + TF_RETURN_IF_ERROR(StatusFromTF_Status(c_status.get())); + SP_Stream stream_handle = + static_cast(stream->implementation())->Handle(); + stream_executor_->record_event(&device_, stream_handle, event_handle, + c_status.get()); + port::Status s = StatusFromTF_Status(c_status.get()); + if (!s.ok()) { + stream_executor_->destroy_event(&device_, event_handle); + return s; + } + stream_executor_->block_host_for_event(&device_, event_handle, + c_status.get()); + stream_executor_->destroy_event(&device_, event_handle); + return StatusFromTF_Status(c_status.get()); + } + + port::Status GetStatus(Stream* stream) override { + OwnedTFStatus c_status(TF_NewStatus()); + SP_Stream stream_handle = + static_cast(stream->implementation())->Handle(); + stream_executor_->get_stream_status(&device_, stream_handle, + c_status.get()); + return StatusFromTF_Status(c_status.get()); + } + int PlatformDeviceCount() override { return visible_device_count_; } + port::Status EnablePeerAccessTo(StreamExecutorInterface* other) override { + return port::UnimplementedError( + "EnablePeerAccessTo is not 
supported by pluggable device.");
+  }
+  bool CanEnablePeerAccessTo(StreamExecutorInterface* other) override {
+    return false;
+  }
+
+  bool DeviceMemoryUsage(int64* free, int64* total) const override {
+    static_assert(sizeof(int64_t) == sizeof(tensorflow::int64),
+                  "64-bit int types should match in size");
+    return stream_executor_->device_memory_usage(
+        &device_, reinterpret_cast<int64_t*>(free),
+        reinterpret_cast<int64_t*>(total));
+  }
+
+  // Creates a new DeviceDescription object.
+  // Ownership is transferred to the caller.
+  port::StatusOr<std::unique_ptr<DeviceDescription>> CreateDeviceDescription()
+      const override {
+    // TODO(annarev): Figure out if we need to support more description fields.
+    internal::DeviceDescriptionBuilder builder;
+    builder.set_name(platform_name_);
+    return builder.Build();
+  }
+
+  // Each call creates a new instance of the platform-specific implementation
+  // of the corresponding interface type.
+  std::unique_ptr<internal::EventInterface> CreateEventImplementation()
+      override {
+    return std::unique_ptr<internal::EventInterface>(
+        new CEvent(&device_, stream_executor_));
+  }
+  std::unique_ptr<internal::KernelInterface> CreateKernelImplementation()
+      override {
+    LOG(FATAL)
+        << "CreateKernelImplementation is not supported by pluggable device.";
+  }
+  std::unique_ptr<internal::StreamInterface> GetStreamImplementation()
+      override {
+    return std::unique_ptr<internal::StreamInterface>(
+        new CStream(&device_, stream_executor_));
+  }
+  std::unique_ptr<internal::TimerInterface> GetTimerImplementation() override {
+    return std::unique_ptr<internal::TimerInterface>(
+        new CTimer(&device_, stream_executor_, timer_fns_));
+  }
+
+ private:
+  SP_Device device_;
+  void (*destroy_device_)(SP_Device* const device);
+  SP_StreamExecutor* stream_executor_;
+  SP_TimerFns* timer_fns_;
+  std::string platform_name_;
+  int visible_device_count_;
+};
+}  // namespace
+
+CPlatform::CPlatform(SP_Platform platform,
+                     void (*destroy_platform)(SP_Platform*),
+                     SP_StreamExecutor stream_executor, SP_TimerFns timer_fns)
+    : platform_(std::move(platform)),
+      destroy_platform_(destroy_platform),
+      stream_executor_(std::move(stream_executor)),
+      timer_fns_(std::move(timer_fns)),
+      name_(platform.name) {}
+
+CPlatform::~CPlatform() {
+  executor_cache_.DestroyAllExecutors();
+  platform_.destroy_stream_executor(&stream_executor_);
+  platform_.destroy_timer_fns(&timer_fns_);
+  destroy_platform_(&platform_);
+}
+
+port::StatusOr<std::unique_ptr<DeviceDescription>>
+CPlatform::DescriptionForDevice(int ordinal) const {
+  // TODO(annarev): see if we can get StreamExecutor instance
+  // and call GetDeviceDescription. executor_cache_.Get would need
+  // to be made const for it to work.
+  internal::DeviceDescriptionBuilder builder;
+  builder.set_name(name_);
+  return builder.Build();
+}
+port::StatusOr<StreamExecutor*> CPlatform::ExecutorForDevice(int ordinal) {
+  stream_executor::StreamExecutorConfig config;
+  config.ordinal = ordinal;
+  return GetExecutor(config);
+}
+port::StatusOr<StreamExecutor*> CPlatform::ExecutorForDeviceWithPluginConfig(
+    int ordinal, const PluginConfig& plugin_config) {
+  StreamExecutorConfig config;
+  config.ordinal = ordinal;
+  config.plugin_config = plugin_config;
+  return GetExecutor(config);
+}
+port::StatusOr<StreamExecutor*> CPlatform::GetExecutor(
+    const StreamExecutorConfig& config) {
+  return executor_cache_.GetOrCreate(
+      config, [&]() { return GetUncachedExecutor(config); });
+}
+port::StatusOr<std::unique_ptr<StreamExecutor>> CPlatform::GetUncachedExecutor(
+    const StreamExecutorConfig& config) {
+  // Fill device creation params
+  SE_CreateDeviceParams device_params{SE_CREATE_DEVICE_PARAMS_STRUCT_SIZE};
+  SP_Device device{SP_DEVICE_STRUCT_SIZE};
+  device_params.device = &device;
+  device_params.ext = nullptr;
+  device_params.ordinal = config.ordinal;
+  OwnedTFStatus c_status(TF_NewStatus());
+
+  // Create Device
+  platform_.create_device(&device_params, c_status.get());
+  TF_RETURN_IF_ERROR(StatusFromTF_Status(c_status.get()));
+  TF_RETURN_IF_ERROR(ValidateSPDevice(device));
+
+  auto executor = absl::make_unique<CStreamExecutor>(
+      std::move(device), platform_.destroy_device, &stream_executor_,
+      &timer_fns_, name_, platform_.visible_device_count);
+  auto result = absl::make_unique<StreamExecutor>(this, std::move(executor),
+                                                  config.ordinal);
+  return result;
+}
+
+port::Status RegisterDevicePlugin(const std::string& dso_path) {
+  // Step 1: Load plugin
+  tensorflow::Env* env = tensorflow::Env::Default();
+  void* dso_handle;
+  TF_RETURN_IF_ERROR(env->LoadDynamicLibrary(dso_path.c_str(), &dso_handle));
+
+  // Step 2: Load symbol for `SE_InitPlugin`
+  void* dso_symbol;
+  TF_RETURN_IF_ERROR(
+      env->GetSymbolFromLibrary(dso_handle, "SE_InitPlugin", &dso_symbol));
+
+  // Step 3: Call `SE_InitPlugin`
+  auto init_fn = reinterpret_cast<SEPluginInitFn>(dso_symbol);
+  return RegisterDevicePlugin(init_fn);
+}
+
+port::Status RegisterDevicePlugin(SEPluginInitFn init_fn) {
+  SE_PlatformRegistrationParams params{
+      SE_PLATFORM_REGISTRATION_PARAMS_STRUCT_SIZE};
+  SP_Platform platform{SP_PLATFORM_STRUCT_SIZE};
+  params.major_version = SE_MAJOR;
+  params.minor_version = SE_MINOR;
+  params.revision_version = SE_REVISION;
+  params.platform = &platform;
+
+  OwnedTFStatus c_status(TF_NewStatus());
+  init_fn(&params, c_status.get());
+  TF_RETURN_IF_ERROR(tensorflow::StatusFromTF_Status(c_status.get()));
+  TF_RETURN_IF_ERROR(ValidateSEPlatformRegistrationParams(params));
+  TF_RETURN_IF_ERROR(ValidateSPPlatform(platform));
+
+  // Fill stream executor creation params
+  SE_CreateStreamExecutorParams se_params{
+      SE_CREATE_STREAM_EXECUTOR_PARAMS_STRUCT_SIZE};
+  SP_StreamExecutor se{SP_STREAMEXECUTOR_STRUCT_SIZE};
+  se_params.stream_executor = &se;
+
+  // Create StreamExecutor
+  platform.create_stream_executor(&se_params, c_status.get());
+  TF_RETURN_IF_ERROR(tensorflow::StatusFromTF_Status(c_status.get()));
+  TF_RETURN_IF_ERROR(ValidateSPStreamExecutor(se));
+
+  SP_TimerFns timer_fns{SP_TIMER_FNS_STRUCT_SIZE};
+  platform.create_timer_fns(&timer_fns, c_status.get());
+  TF_RETURN_IF_ERROR(tensorflow::StatusFromTF_Status(c_status.get()));
+  TF_RETURN_IF_ERROR(ValidateSPTimerFns(timer_fns));
+
+  // Register new platform
+  std::string platform_name = std::string(platform.name);
+  std::unique_ptr<stream_executor::CPlatform> cplatform(
+      new stream_executor::CPlatform(std::move(platform),
+                                     params.destroy_platform,
std::move(se), + std::move(timer_fns))); + SE_CHECK_OK(stream_executor::MultiPlatformManager::RegisterPlatform( + std::move(cplatform))); + + // TODO(annarev): Add pluggable device registration here. + return port::Status::OK(); +} +} // namespace stream_executor diff --git a/tensorflow/c/experimental/stream_executor/stream_executor.h b/tensorflow/c/experimental/stream_executor/stream_executor.h new file mode 100644 index 00000000000..b3459a29ccc --- /dev/null +++ b/tensorflow/c/experimental/stream_executor/stream_executor.h @@ -0,0 +1,395 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_STREAM_EXECUTOR_STREAM_EXECUTOR_H_ +#define TENSORFLOW_C_EXPERIMENTAL_STREAM_EXECUTOR_STREAM_EXECUTOR_H_ +#include +#include + +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/tf_status.h" + +// -------------------------------------------------------------------------- +// C API for StreamExecutor. The API is under active development and eventually +// should allow registering a pluggable device with TensorFlow. +// +// Conventions: +// * Struct prefix indicates whether struct fields should be filled by the +// plugin or core implementation: +// * SE_ : set/filled by core unless explicitly marked otherwise. +// * SP_ : set/filled by plugin unless explicitly marked otherwise. +// * We use `struct_size` for version checking. It is exempt from the `SE/SP` +// rule above and should be set both by core and the plugin. +// * For example, `create_device` function receives `SP_Device*` as input +// with `struct_size` populated by core. The plugin is responsible for +// setting `struct_size` as well, along with all other fields. +// * Refer to "TensorFlow Versioning Strategy" section at +// https://github.com/tensorflow/community/pull/257/files. +// * Note that the API is still under active development and doesn't have +// versioning guarantees yet. +// * `void* ext` is a free-form field that can be populated by +// a plugin in `SP_*` structs or potential future extension points in `SE_` +// structs. +// +// Example usage: +// +// /* Sample TensorFlow code below, exact implementation might differ. */ +// // Version checking uses `struct_size`. It is exempt from the `SE/SP` rule +// // above and should be set both by core and the plugin." +// SP_Device device { SP_DEVICE_STRUCT_SIZE }; +// SE_CreateDeviceParams params { SE_CREATE_DEVICE_PARAMS_STRUCT_SIZE } ; +// params.device = &device; +// +// /* Plugin code below */ +// constexpr char DEVICE_NAME[] = "MyDevice"; +// constexpr char DEVICE_TYPE[] = "GPU"; +// +// void create_device(SE_CreateDeviceParams* params, TF_Status* status) { +// // Custom actions based on TensorFlow's view of SP_Device. 
+// OnTFDeviceView(params->device->struct_size); +// params->device = { SP_DEVICE_STRUCT_SIZE }; +// params->device->device_handle = get_my_device_handle(device->ordinal); +// params->device->ordinal = params->ordinal; +// ... +// } +// +// void destroy_device(SP_Device* device) { +// delete_my_device_handle(device->device_handle); +// } +// +// void SE_InitPlugin( +// SE_PlatformRegistrationParams* params, +// TF_Status* status) { +// params->platform = { SP_PLATFORM_STRUCT_SIZE }; +// // Values such as `name` and `type` must outlive SE_InitPlugin call. +// params->platform->name = DEVICE_NAME; +// params->platform->type = DEVICE_TYPE; +// params->platform->visible_device_count = 2; +// params->platform->create_device = create_device; +// params->platform->destroy_device = destroy_device; +// ... +// } + +#define SE_MAJOR 0 +#define SE_MINOR 0 +#define SE_REVISION 1 + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct SP_Stream_st* SP_Stream; +typedef struct SP_Event_st* SP_Event; +typedef struct SP_Timer_st* SP_Timer; +// Takes `callback_arg` passed to `host_callback` as the first argument. +typedef void (*SE_StatusCallbackFn)(void* const, TF_Status* const); + +typedef struct SP_TimerFns { + size_t struct_size; + void* ext; // reserved for future use + uint64_t (*nanoseconds)(SP_Timer timer); +} SP_TimerFns; + +#define SP_TIMER_FNS_STRUCT_SIZE TF_OFFSET_OF_END(SP_TimerFns, nanoseconds) + +typedef struct SP_AllocatorStats { + size_t struct_size; + int64_t num_allocs; + int64_t bytes_in_use; + int64_t peak_bytes_in_use; + int64_t largest_alloc_size; + + int8_t has_bytes_limit; + int64_t bytes_limit; + + int64_t bytes_reserved; + int64_t peak_bytes_reserved; + + int8_t has_bytes_reservable_limit; + int64_t bytes_reservable_limit; + + int64_t largest_free_block_bytes; +} SP_AllocatorStats; + +#define SP_ALLOCATORSTATS_STRUCT_SIZE \ + TF_OFFSET_OF_END(SP_AllocatorStats, largest_free_block_bytes) + +// Potential states for an SP_Event. If `poll_for_status` returns anything aside +// from kPending or kComplete, an error has occurred; kUnknown is a bad state. +typedef enum SE_EventStatus { + SE_EVENT_UNKNOWN, + SE_EVENT_ERROR, + SE_EVENT_PENDING, + SE_EVENT_COMPLETE, +} SE_EventStatus; + +// Memory allocation information. +// This matches DeviceMemoryBase defined here: +// https://cs.opensource.google/tensorflow/tensorflow/+/refs/tags/v2.3.0:tensorflow/stream_executor/device_memory.h;l=57 +typedef struct SP_DeviceMemoryBase { + size_t struct_size; + void* ext; // free-form data set by plugin + // Platform-dependent value representing allocated memory. + void* opaque; + uint64_t size; // Size in bytes of this allocation. + uint64_t payload; // Value for plugin's use +} SP_DeviceMemoryBase; + +#define SP_DEVICE_MEMORY_BASE_STRUCT_SIZE \ + TF_OFFSET_OF_END(SP_DeviceMemoryBase, size) + +typedef struct SP_Device { + size_t struct_size; + void* ext; // free-form data set by plugin + int32_t ordinal; // device index + + // Device vendor can store handle to their device representation + // here. + void* device_handle; +} SP_Device; + +#define SP_DEVICE_STRUCT_SIZE TF_OFFSET_OF_END(SP_Device, device_handle) + +typedef struct SE_CreateDeviceParams { + size_t struct_size; + void* ext; // reserved for future use + int32_t ordinal; // device index + + SP_Device* device; // Input/output, struct_size set by TF for plugin to read. + // Subsequently plugin fills the entire struct. 
+} SE_CreateDeviceParams; + +#define SE_CREATE_DEVICE_PARAMS_STRUCT_SIZE \ + TF_OFFSET_OF_END(SE_CreateDeviceParams, device) + +typedef struct SP_StreamExecutor { + size_t struct_size; + void* ext; // reserved for future use + + /*** ALLOCATION CALLBACKS ***/ + // Synchronously allocates `size` bytes on the underlying platform and returns + // `SP_DeviceMemoryBase` representing that allocation. In the case of failure, + // nullptr is returned. + // `memory_space` is reserved for a potential future usage and should be set + // to 0. + void (*allocate)(const SP_Device* device, uint64_t size, int64_t memory_space, + SP_DeviceMemoryBase* mem); + + // Deallocate the device memory previously allocated via this interface. + // Deallocation of a nullptr-representative value is permitted. + void (*deallocate)(const SP_Device* device, SP_DeviceMemoryBase* memory); + + // Allocates a region of host memory and registers it with the platform API. + // Memory allocated in this manner is required for use in asynchronous memcpy + // operations, such as `memcpy_dtoh`. + void* (*host_memory_allocate)(const SP_Device* device, uint64_t size); + + // Deallocates a region of host memory allocated by `host_memory_allocate`. + void (*host_memory_deallocate)(const SP_Device* device, void* mem); + + // Fills SP_AllocatorStats with allocator statistics, if it is available. + // If it is not available, return false. + TF_Bool (*get_allocator_stats)(const SP_Device* device, + SP_AllocatorStats* stats); + // Fills the underlying device memory usage information, if it is + // available. If it is not available (false is returned), free/total need not + // be initialized. + TF_Bool (*device_memory_usage)(const SP_Device* device, int64_t* free, + int64_t* total); + + /*** STREAM CALLBACKS ***/ + // Creates SP_Stream. This call should also allocate stream + // resources on the underlying platform and initializes its + // internals. + void (*create_stream)(const SP_Device* device, SP_Stream* stream, + TF_Status* status); + + // Destroys SP_Stream and deallocates any underlying resources. + void (*destroy_stream)(const SP_Device* device, SP_Stream stream); + + // Causes `dependent` to not begin execution until `other` has finished its + // last-enqueued work. + void (*create_stream_dependency)(const SP_Device* device, SP_Stream dependent, + SP_Stream other, TF_Status* status); + + // Without blocking the device, retrieve the current stream status. + void (*get_stream_status)(const SP_Device* device, SP_Stream stream, + TF_Status* status); + + /*** EVENT CALLBACKS ***/ + // Create SP_Event. Performs platform-specific allocation and initialization + // of an event. + void (*create_event)(const SP_Device* device, SP_Event* event, + TF_Status* status); + + // Destroy SE_Event and perform any platform-specific deallocation and + // cleanup of an event. + void (*destroy_event)(const SP_Device* device, SP_Event event); + + // Requests the current status of the event from the underlying platform. + SE_EventStatus (*get_event_status)(const SP_Device* device, SP_Event event); + // Inserts the specified event at the end of the specified stream. + void (*record_event)(const SP_Device* device, SP_Stream stream, + SP_Event event, TF_Status* status); + + // Wait for the specified event at the end of the specified stream. + void (*wait_for_event)(const SP_Device* const device, SP_Stream stream, + SP_Event event, TF_Status* const status); + + /*** TIMER CALLBACKS ***/ + // Creates SP_Timer. 
Allocates timer resources on the underlying platform + // and initializes its internals, setting `timer` output variable. Sets + // values in `timer_fns` struct. + void (*create_timer)(const SP_Device* device, SP_Timer* timer, + TF_Status* status); + + // Destroy timer and deallocates timer resources on the underlying platform. + void (*destroy_timer)(const SP_Device* device, SP_Timer timer); + + // Records a start event for an interval timer. + void (*start_timer)(const SP_Device* device, SP_Stream stream, SP_Timer timer, + TF_Status* status); + + // Records a stop event for an interval timer. + void (*stop_timer)(const SP_Device* device, SP_Stream stream, SP_Timer timer, + TF_Status* status); + + /*** MEMCPY CALLBACKS ***/ + // Enqueues a memcpy operation onto stream, with a host destination location + // `host_dst` and a device memory source, with target size `size`. + void (*memcpy_dtoh)(const SP_Device* device, SP_Stream stream, void* host_dst, + const SP_DeviceMemoryBase* device_src, uint64_t size, + TF_Status* status); + + // Enqueues a memcpy operation onto stream, with a device destination + // location and a host memory source, with target size `size`. + void (*memcpy_htod)(const SP_Device* device, SP_Stream stream, + SP_DeviceMemoryBase* device_dst, const void* host_src, + uint64_t size, TF_Status* status); + + // Enqueues a memcpy operation onto stream, with a device destination + // location and a device memory source, with target size `size`. + void (*memcpy_dtod)(const SP_Device* device, SP_Stream stream, + SP_DeviceMemoryBase* device_dst, + const SP_DeviceMemoryBase* device_src, uint64_t size, + TF_Status* status); + + // Blocks the caller while a data segment of the given size is + // copied from the device source to the host destination. + void (*sync_memcpy_dtoh)(const SP_Device* device, void* host_dst, + const SP_DeviceMemoryBase* device_src, uint64_t size, + TF_Status* status); + + // Blocks the caller while a data segment of the given size is + // copied from the host source to the device destination. + void (*sync_memcpy_htod)(const SP_Device* device, + SP_DeviceMemoryBase* device_dst, + const void* host_src, uint64_t size, + TF_Status* status); + + // Blocks the caller while a data segment of the given size is copied from the + // device source to the device destination. + void (*sync_memcpy_dtod)(const SP_Device* device, + SP_DeviceMemoryBase* device_dst, + const SP_DeviceMemoryBase* device_src, uint64_t size, + TF_Status* status); + + // Causes the host code to synchronously wait for the event to complete. + void (*block_host_for_event)(const SP_Device* device, SP_Event event, + TF_Status* status); + + // Synchronizes all activity occurring in the StreamExecutor's context (most + // likely a whole device). + void (*synchronize_all_activity)(const SP_Device* device, TF_Status* status); + + // Enqueues on a stream a user-specified function to be run on the host. + // `callback_arg` should be passed as the first argument to `callback_fn`. 
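+  //
+  // A plugin could implement this roughly as follows (a simplified sketch
+  // that runs the callback inline; `my_host_callback` is hypothetical, and a
+  // real plugin would normally defer the callback until previously enqueued
+  // work on `stream` has completed):
+  //
+  //   TF_Bool my_host_callback(SP_Device* device, SP_Stream stream,
+  //                            SE_StatusCallbackFn callback_fn,
+  //                            void* callback_arg) {
+  //     TF_Status* status = TF_NewStatus();
+  //     callback_fn(callback_arg, status);
+  //     TF_Bool ok = TF_GetCode(status) == TF_OK;
+  //     TF_DeleteStatus(status);
+  //     return ok;
+  //   }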
+ TF_Bool (*host_callback)(SP_Device* device, SP_Stream stream, + SE_StatusCallbackFn callback_fn, void* callback_arg); +} SP_StreamExecutor; + +#define SP_STREAMEXECUTOR_STRUCT_SIZE \ + TF_OFFSET_OF_END(SP_StreamExecutor, host_callback) + +typedef struct SE_CreateStreamExecutorParams { + size_t struct_size; + void* ext; // reserved for future use + + SP_StreamExecutor* stream_executor; // output, to be filled by plugin +} SE_CreateStreamExecutorParams; + +#define SE_CREATE_STREAM_EXECUTOR_PARAMS_STRUCT_SIZE \ + TF_OFFSET_OF_END(SE_CreateStreamExecutorParams, stream_executor) + +typedef struct SP_Platform { + size_t struct_size; + + void* ext; // free-form data set by plugin + + // Platform name. Must be null-terminated. + const char* name; + + // Device type name, for example GPU. Must be null-terminated. + const char* type; + + // Number of visible devices + size_t visible_device_count; + + // Callbacks for creating/destroying SP_Device. + void (*create_device)(SE_CreateDeviceParams* params, TF_Status* status); + + // Clean up fields inside SP_Device that were allocated + // by the plugin. `device` itself should not be deleted here. + void (*destroy_device)(SP_Device* device); + + // Callbacks for creating/destroying SP_StreamExecutor. + void (*create_stream_executor)(SE_CreateStreamExecutorParams* params, + TF_Status* status); + // Clean up fields inside SP_StreamExecutor that were allocated + // by the plugin. `stream_executor` itself should not be deleted here. + void (*destroy_stream_executor)(SP_StreamExecutor* stream_executor); + + // Callbacks for creating/destroying SP_TimerFns. + void (*create_timer_fns)(SP_TimerFns* timer, TF_Status* status); + + void (*destroy_timer_fns)(SP_TimerFns* timer_fns); +} SP_Platform; + +#define SP_PLATFORM_STRUCT_SIZE TF_OFFSET_OF_END(SP_Platform, destroy_timer_fns) + +typedef struct SE_PlatformRegistrationParams { + size_t struct_size; + void* ext; // reserved for future use + + // StreamExecutor C API version. + int32_t major_version; + int32_t minor_version; + int32_t revision_version; + + SP_Platform* platform; // output, set by plugin + // Clean up fields inside SP_Platform that were allocated + // by the plugin. `platform` itself should not be deleted here. + void (*destroy_platform)(SP_Platform* platform); // out, set by plugin +} SE_PlatformRegistrationParams; + +#define SE_PLATFORM_REGISTRATION_PARAMS_STRUCT_SIZE \ + TF_OFFSET_OF_END(SE_PlatformRegistrationParams, destroy_platform) + +void SE_InitPlugin(SE_PlatformRegistrationParams* params, TF_Status* status); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // TENSORFLOW_C_EXPERIMENTAL_STREAM_EXECUTOR_STREAM_EXECUTOR_H_ diff --git a/tensorflow/c/experimental/stream_executor/stream_executor_internal.h b/tensorflow/c/experimental/stream_executor/stream_executor_internal.h new file mode 100644 index 00000000000..2285fe85867 --- /dev/null +++ b/tensorflow/c/experimental/stream_executor/stream_executor_internal.h @@ -0,0 +1,80 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Classes and utilities that work with StreamExecutor C API for internal use.
+// This includes functions used for device registration and interfaces needed
+// for testing.
+#ifndef TENSORFLOW_C_EXPERIMENTAL_STREAM_EXECUTOR_STREAM_EXECUTOR_INTERNAL_H_
+#define TENSORFLOW_C_EXPERIMENTAL_STREAM_EXECUTOR_STREAM_EXECUTOR_INTERNAL_H_
+
+#include "tensorflow/c/experimental/stream_executor/stream_executor.h"
+#include "tensorflow/stream_executor/executor_cache.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/platform.h"
+
+namespace stream_executor {
+
+// Plugin initialization function that a device plugin
+// must define.
+typedef void (*SEPluginInitFn)(SE_PlatformRegistrationParams* const,
+                               TF_Status* const);
+
+// Loads dso and registers StreamExecutor-based pluggable device.
+port::Status RegisterDevicePlugin(const std::string& dso_path);
+
+// Allow registering a plugin using a function (used for testing).
+port::Status RegisterDevicePlugin(SEPluginInitFn init_fn);
+
+class CPlatform : public Platform {
+ public:
+  explicit CPlatform(SP_Platform platform,
+                     void (*destroy_platform)(SP_Platform*),
+                     SP_StreamExecutor stream_executor, SP_TimerFns timer_fns);
+  ~CPlatform() override;
+
+  Id id() const override { return const_cast<int*>(&plugin_id_value_); }
+  const std::string& Name() const override { return name_; }
+  int VisibleDeviceCount() const override {
+    return platform_.visible_device_count;
+  }
+  port::StatusOr<std::unique_ptr<DeviceDescription>> DescriptionForDevice(
+      int ordinal) const override;
+  port::StatusOr<StreamExecutor*> ExecutorForDevice(int ordinal) override;
+  port::StatusOr<StreamExecutor*> ExecutorForDeviceWithPluginConfig(
+      int ordinal, const PluginConfig& plugin_config) override;
+  port::StatusOr<StreamExecutor*> GetExecutor(
+      const StreamExecutorConfig& config) override;
+  port::StatusOr<std::unique_ptr<StreamExecutor>> GetUncachedExecutor(
+      const StreamExecutorConfig& config) override;
+
+  // Trace listener is not supported
+  void RegisterTraceListener(std::unique_ptr<TraceListener> listener) override {
+    LOG(FATAL) << "RegisterTraceListener is not supported by pluggable device";
+  }
+  void UnregisterTraceListener(TraceListener* listener) override {}
+
+  void DestroyAllExecutors() { executor_cache_.DestroyAllExecutors(); }
+
+ private:
+  SP_Platform platform_;
+  void (*destroy_platform_)(SP_Platform*);
+  SP_StreamExecutor stream_executor_;
+  SP_TimerFns timer_fns_;
+  const std::string name_;
+  int plugin_id_value_;
+  stream_executor::ExecutorCache executor_cache_;
+};
+
+}  // namespace stream_executor
+#endif  // TENSORFLOW_C_EXPERIMENTAL_STREAM_EXECUTOR_STREAM_EXECUTOR_INTERNAL_H_
diff --git a/tensorflow/c/experimental/stream_executor/stream_executor_test.cc b/tensorflow/c/experimental/stream_executor/stream_executor_test.cc
new file mode 100644
index 00000000000..86fe00fe5ad
--- /dev/null
+++ b/tensorflow/c/experimental/stream_executor/stream_executor_test.cc
@@ -0,0 +1,802 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0(the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/c/experimental/stream_executor/stream_executor.h" + +#include "tensorflow/c/experimental/stream_executor/stream_executor_internal.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/protobuf/error_codes.pb.h" +#include "tensorflow/stream_executor/event.h" +#include "tensorflow/stream_executor/multi_platform_manager.h" +#include "tensorflow/stream_executor/stream.h" +#include "tensorflow/stream_executor/stream_executor_pimpl.h" +#include "tensorflow/stream_executor/timer.h" + +struct SP_Stream_st { + explicit SP_Stream_st(int id) : stream_id(id) {} + int stream_id; +}; + +struct SP_Event_st { + explicit SP_Event_st(int id) : event_id(id) {} + int event_id; +}; + +struct SP_Timer_st { + explicit SP_Timer_st(int id) : timer_id(id) {} + int timer_id; +}; + +namespace stream_executor { +namespace { +constexpr int DEVICE_COUNT = 2; +constexpr char DEVICE_NAME[] = "MyDevice"; +constexpr char DEVICE_TYPE[] = "GPU"; + +/*** Create SP_StreamExecutor (with empty functions) ***/ +void allocate(const SP_Device* const device, uint64_t size, + int64_t memory_space, SP_DeviceMemoryBase* const mem) {} +void deallocate(const SP_Device* const device, SP_DeviceMemoryBase* const mem) { +} +TF_Bool get_allocator_stats(const SP_Device* const device, + SP_AllocatorStats* const stats) { + return true; +} +TF_Bool device_memory_usage(const SP_Device* const device, int64_t* const free, + int64_t* const total) { + return true; +} +void create_stream(const SP_Device* const device, SP_Stream* stream, + TF_Status* const status) { + stream = nullptr; +} +void destroy_stream(const SP_Device* const device, SP_Stream stream) {} +void create_stream_dependency(const SP_Device* const device, + SP_Stream dependent, SP_Stream other, + TF_Status* const status) {} +void get_stream_status(const SP_Device* const device, SP_Stream stream, + TF_Status* const status) {} +void create_event(const SP_Device* const device, SP_Event* event, + TF_Status* const status) { + event = nullptr; +} +void destroy_event(const SP_Device* const device, SP_Event event) {} +SE_EventStatus get_event_status(const SP_Device* const device, SP_Event event) { + return SE_EVENT_UNKNOWN; +} +void record_event(const SP_Device* const device, SP_Stream stream, + SP_Event event, TF_Status* const status) {} +void wait_for_event(const SP_Device* const device, SP_Stream stream, + SP_Event event, TF_Status* const status) {} +void create_timer(const SP_Device* const device, SP_Timer* timer, + TF_Status* const status) {} +void destroy_timer(const SP_Device* const device, SP_Timer timer) {} +void start_timer(const SP_Device* const device, SP_Stream stream, + SP_Timer timer, TF_Status* const status) {} +void stop_timer(const SP_Device* const device, SP_Stream stream, SP_Timer timer, + TF_Status* const status) {} +void memcpy_dtoh(const SP_Device* const device, SP_Stream stream, + void* host_dst, const SP_DeviceMemoryBase* const device_src, + uint64_t size, TF_Status* const status) {} +void memcpy_htod(const SP_Device* const device, SP_Stream stream, + SP_DeviceMemoryBase* const device_dst, const void* host_src, + uint64_t size, TF_Status* const status) {} +void sync_memcpy_dtoh(const SP_Device* const device, void* host_dst, + const SP_DeviceMemoryBase* const device_src, + uint64_t size, 
TF_Status* const status) {} +void sync_memcpy_htod(const SP_Device* const device, + SP_DeviceMemoryBase* const device_dst, + const void* host_src, uint64_t size, + TF_Status* const status) {} +void block_host_for_event(const SP_Device* const device, SP_Event event, + TF_Status* const status) {} +void synchronize_all_activity(const SP_Device* const device, + TF_Status* const status) {} +TF_Bool host_callback(SP_Device* const device, SP_Stream stream, + SE_StatusCallbackFn const callback_fn, + void* const callback_arg) { + return true; +} + +void PopulateDefaultStreamExecutor(SP_StreamExecutor* se) { + se->struct_size = SP_STREAMEXECUTOR_STRUCT_SIZE; + se->allocate = allocate; + se->deallocate = deallocate; + se->get_allocator_stats = get_allocator_stats; + se->device_memory_usage = device_memory_usage; + se->create_stream = create_stream; + se->destroy_stream = destroy_stream; + se->create_stream_dependency = create_stream_dependency; + se->get_stream_status = get_stream_status; + se->create_event = create_event; + se->destroy_event = destroy_event; + se->get_event_status = get_event_status; + se->record_event = record_event; + se->wait_for_event = wait_for_event; + se->create_timer = create_timer; + se->destroy_timer = destroy_timer; + se->start_timer = start_timer; + se->stop_timer = stop_timer; + se->memcpy_dtoh = memcpy_dtoh; + se->memcpy_htod = memcpy_htod; + se->sync_memcpy_dtoh = sync_memcpy_dtoh; + se->sync_memcpy_htod = sync_memcpy_htod; + se->block_host_for_event = block_host_for_event; + se->synchronize_all_activity = synchronize_all_activity; + se->host_callback = host_callback; +} + +/*** Create SP_TimerFns ***/ +uint64_t nanoseconds(SP_Timer timer) { return timer->timer_id; } + +void PopulateDefaultTimerFns(SP_TimerFns* timer_fns) { + timer_fns->nanoseconds = nanoseconds; +} + +/*** Create SP_Platform ***/ +void create_timer_fns(SP_TimerFns* timer_fns, TF_Status* status) { + TF_SetStatus(status, TF_OK, ""); + PopulateDefaultTimerFns(timer_fns); +} +void destroy_timer_fns(SP_TimerFns* timer_fns) {} + +void create_stream_executor(SE_CreateStreamExecutorParams* params, + TF_Status* status) { + TF_SetStatus(status, TF_OK, ""); + PopulateDefaultStreamExecutor(params->stream_executor); +} +void destroy_stream_executor(SP_StreamExecutor* se) {} + +void create_device(SE_CreateDeviceParams* params, TF_Status* status) { + TF_SetStatus(status, TF_OK, ""); + params->device->struct_size = SP_DEVICE_STRUCT_SIZE; +} +void destroy_device(SP_Device* device) {} + +void PopulateDefaultPlatform(SP_Platform* platform) { + platform->struct_size = SP_PLATFORM_STRUCT_SIZE; + platform->name = DEVICE_NAME; + platform->type = DEVICE_TYPE; + platform->visible_device_count = DEVICE_COUNT; + platform->create_device = create_device; + platform->destroy_device = destroy_device; + platform->create_stream_executor = create_stream_executor; + platform->destroy_stream_executor = destroy_stream_executor; + platform->create_timer_fns = create_timer_fns; + platform->destroy_timer_fns = destroy_timer_fns; +} + +void destroy_platform(SP_Platform* const platform) {} + +/*** Registration tests ***/ +TEST(StreamExecutor, SuccessfulRegistration) { + auto plugin_init = [](SE_PlatformRegistrationParams* const params, + TF_Status* const status) -> void { + TF_SetStatus(status, TF_OK, ""); + PopulateDefaultPlatform(params->platform); + params->destroy_platform = destroy_platform; + }; + port::Status status = RegisterDevicePlugin(plugin_init); + TF_ASSERT_OK(status); + port::StatusOr maybe_platform = + 
MultiPlatformManager::PlatformWithName("MyDevice"); + TF_ASSERT_OK(maybe_platform.status()); + Platform* platform = maybe_platform.ConsumeValueOrDie(); + ASSERT_EQ(platform->Name(), DEVICE_NAME); + ASSERT_EQ(platform->VisibleDeviceCount(), DEVICE_COUNT); + + port::StatusOr maybe_executor = + platform->ExecutorForDevice(0); + TF_ASSERT_OK(maybe_executor.status()); + StreamExecutor* executor = maybe_executor.ConsumeValueOrDie(); + ASSERT_EQ(executor->GetDeviceDescription().name(), "MyDevice"); +} + +TEST(StreamExecutor, NameNotSet) { + auto plugin_init = [](SE_PlatformRegistrationParams* const params, + TF_Status* const status) -> void { + TF_SetStatus(status, TF_OK, ""); + PopulateDefaultPlatform(params->platform); + params->platform->name = nullptr; + params->destroy_platform = destroy_platform; + }; + + port::Status status = RegisterDevicePlugin(plugin_init); + ASSERT_EQ(status.code(), tensorflow::error::FAILED_PRECONDITION); + ASSERT_EQ(status.error_message(), "'name' field in SP_Platform must be set."); +} + +TEST(StreamExecutor, CreateDeviceNotSet) { + auto plugin_init = [](SE_PlatformRegistrationParams* const params, + TF_Status* const status) -> void { + TF_SetStatus(status, TF_OK, ""); + PopulateDefaultPlatform(params->platform); + params->platform->create_device = nullptr; + params->destroy_platform = destroy_platform; + }; + + port::Status status = RegisterDevicePlugin(plugin_init); + ASSERT_EQ(status.code(), tensorflow::error::FAILED_PRECONDITION); + ASSERT_EQ(status.error_message(), + "'create_device' field in SP_Platform must be set."); +} + +/*** StreamExecutor behavior tests ***/ +class StreamExecutorTest : public ::testing::Test { + protected: + StreamExecutorTest() {} + void SetUp() override { + PopulateDefaultPlatform(&platform_); + PopulateDefaultStreamExecutor(&se_); + PopulateDefaultTimerFns(&timer_fns_); + } + void TearDown() override {} + + StreamExecutor* GetExecutor(int ordinal) { + if (!cplatform_) { + cplatform_ = absl::make_unique(platform_, destroy_platform, + se_, timer_fns_); + } + port::StatusOr maybe_executor = + cplatform_->ExecutorForDevice(ordinal); + TF_CHECK_OK(maybe_executor.status()); + return maybe_executor.ConsumeValueOrDie(); + } + SP_Platform platform_; + SP_StreamExecutor se_; + SP_TimerFns timer_fns_; + std::unique_ptr cplatform_; +}; + +TEST_F(StreamExecutorTest, Allocate) { + se_.allocate = [](const SP_Device* const device, uint64_t size, + int64_t memory_space, SP_DeviceMemoryBase* const mem) { + mem->struct_size = SP_DEVICE_MEMORY_BASE_STRUCT_SIZE; + mem->opaque = std::malloc(size); + mem->size = size; + }; + se_.deallocate = [](const SP_Device* const device, + SP_DeviceMemoryBase* const mem) { + EXPECT_EQ(mem->size, 2 * sizeof(int)); + std::free(mem->opaque); + mem->opaque = nullptr; + mem->size = 0; + }; + StreamExecutor* executor = GetExecutor(0); + DeviceMemory mem = executor->AllocateArray(2); + ASSERT_NE(mem.opaque(), nullptr); + ASSERT_EQ(mem.size(), 2 * sizeof(int)); + executor->Deallocate(&mem); + ASSERT_EQ(mem.opaque(), nullptr); +} + +TEST_F(StreamExecutorTest, HostMemoryAllocate) { + static bool allocate_called = false; + static bool deallocate_called = false; + se_.host_memory_allocate = [](const SP_Device* const device, uint64_t size) { + allocate_called = true; + return std::malloc(size); + }; + se_.host_memory_deallocate = [](const SP_Device* const device, void* mem) { + std::free(mem); + deallocate_called = true; + }; + StreamExecutor* executor = GetExecutor(0); + ASSERT_FALSE(allocate_called); + void* mem = 
executor->HostMemoryAllocate(8); + ASSERT_NE(mem, nullptr); + ASSERT_TRUE(allocate_called); + ASSERT_FALSE(deallocate_called); + executor->HostMemoryDeallocate(mem); + ASSERT_TRUE(deallocate_called); +} + +TEST_F(StreamExecutorTest, GetAllocatorStats) { + se_.get_allocator_stats = [](const SP_Device* const device, + SP_AllocatorStats* const stat) -> TF_Bool { + stat->struct_size = SP_ALLOCATORSTATS_STRUCT_SIZE; + stat->bytes_in_use = 123; + return true; + }; + + StreamExecutor* executor = GetExecutor(0); + absl::optional optional_stats = executor->GetAllocatorStats(); + ASSERT_TRUE(optional_stats.has_value()); + AllocatorStats stats = optional_stats.value(); + ASSERT_EQ(stats.bytes_in_use, 123); +} + +TEST_F(StreamExecutorTest, DeviceMemoryUsage) { + se_.device_memory_usage = [](const SP_Device* const device, + int64_t* const free, + int64_t* const total) -> TF_Bool { + *free = 45; + *total = 7; + return true; + }; + + StreamExecutor* executor = GetExecutor(0); + int64 free = 0; + int64 total = 0; + executor->DeviceMemoryUsage(&free, &total); + ASSERT_EQ(free, 45); + ASSERT_EQ(total, 7); +} + +TEST_F(StreamExecutorTest, CreateStream) { + static bool stream_created = false; + static bool stream_deleted = false; + se_.create_stream = [](const SP_Device* const device, SP_Stream* stream, + TF_Status* const status) -> void { + *stream = new SP_Stream_st(14); + stream_created = true; + }; + se_.destroy_stream = [](const SP_Device* const device, + SP_Stream stream) -> void { + auto custom_stream = static_cast(stream); + ASSERT_EQ(custom_stream->stream_id, 14); + delete custom_stream; + stream_deleted = true; + }; + + StreamExecutor* executor = GetExecutor(0); + ASSERT_FALSE(stream_created); + Stream* stream = new Stream(executor); + stream->Init(); + ASSERT_TRUE(stream->ok()); + ASSERT_TRUE(stream_created); + ASSERT_FALSE(stream_deleted); + delete stream; + ASSERT_TRUE(stream_deleted); +} + +TEST_F(StreamExecutorTest, CreateStreamDependency) { + static bool create_stream_dependency_called = false; + se_.create_stream_dependency = [](const SP_Device* const device, + SP_Stream dependent, SP_Stream other, + TF_Status* const status) { + TF_SetStatus(status, TF_OK, ""); + create_stream_dependency_called = true; + }; + + StreamExecutor* executor = GetExecutor(0); + Stream dependent(executor); + dependent.Init(); + Stream other(executor); + other.Init(); + ASSERT_FALSE(create_stream_dependency_called); + dependent.ThenWaitFor(&other); + ASSERT_TRUE(create_stream_dependency_called); +} + +TEST_F(StreamExecutorTest, StreamStatus) { + static bool status_ok = true; + se_.get_stream_status = [](const SP_Device* const device, SP_Stream stream, + TF_Status* const status) -> void { + if (status_ok) { + TF_SetStatus(status, TF_OK, ""); + } else { + TF_SetStatus(status, TF_INTERNAL, "Test error"); + } + }; + + StreamExecutor* executor = GetExecutor(0); + Stream stream(executor); + stream.Init(); + ASSERT_TRUE(stream.ok()); + TF_ASSERT_OK(stream.RefreshStatus()); + status_ok = false; + auto updated_status = stream.RefreshStatus(); + ASSERT_FALSE(stream.ok()); + ASSERT_EQ(updated_status.error_message(), "Test error"); +} + +TEST_F(StreamExecutorTest, CreateEvent) { + static bool event_created = false; + static bool event_deleted = false; + se_.create_event = [](const SP_Device* const device, SP_Event* event, + TF_Status* const status) -> void { + *event = new SP_Event_st(123); + event_created = true; + }; + se_.destroy_event = [](const SP_Device* const device, + SP_Event event) -> void { + auto custom_event = 
static_cast(event); + ASSERT_EQ(custom_event->event_id, 123); + delete custom_event; + event_deleted = true; + }; + + StreamExecutor* executor = GetExecutor(0); + ASSERT_FALSE(event_created); + Event* event = new Event(executor); + event->Init(); + ASSERT_TRUE(event_created); + ASSERT_FALSE(event_deleted); + delete event; + ASSERT_TRUE(event_deleted); +} + +TEST_F(StreamExecutorTest, PollForEventStatus) { + static SE_EventStatus event_status = SE_EVENT_COMPLETE; + se_.create_event = [](const SP_Device* const device, SP_Event* event, + TF_Status* const status) -> void { + *event = new SP_Event_st(123); + }; + se_.destroy_event = [](const SP_Device* const device, + SP_Event event) -> void { delete event; }; + se_.get_event_status = [](const SP_Device* const device, + SP_Event event) -> SE_EventStatus { + EXPECT_EQ(event->event_id, 123); + return event_status; + }; + + StreamExecutor* executor = GetExecutor(0); + Event event(executor); + event.Init(); + ASSERT_EQ(event.PollForStatus(), Event::Status::kComplete); + event_status = SE_EVENT_ERROR; + ASSERT_EQ(event.PollForStatus(), Event::Status::kError); +} + +TEST_F(StreamExecutorTest, RecordAndWaitForEvent) { + static bool record_called = false; + static bool wait_called = false; + se_.create_stream = [](const SP_Device* const device, SP_Stream* stream, + TF_Status* const status) -> void { + *stream = new SP_Stream_st(1); + }; + se_.destroy_stream = [](const SP_Device* const device, + SP_Stream stream) -> void { delete stream; }; + se_.create_event = [](const SP_Device* const device, SP_Event* event, + TF_Status* const status) -> void { + *event = new SP_Event_st(2); + }; + se_.destroy_event = [](const SP_Device* const device, + SP_Event event) -> void { delete event; }; + se_.record_event = [](const SP_Device* const device, SP_Stream stream, + SP_Event event, TF_Status* const status) { + EXPECT_EQ(stream->stream_id, 1); + EXPECT_EQ(event->event_id, 2); + TF_SetStatus(status, TF_OK, ""); + record_called = true; + }; + se_.wait_for_event = [](const SP_Device* const device, SP_Stream stream, + SP_Event event, TF_Status* const status) { + EXPECT_EQ(stream->stream_id, 1); + EXPECT_EQ(event->event_id, 2); + TF_SetStatus(status, TF_OK, ""); + wait_called = true; + }; + + StreamExecutor* executor = GetExecutor(0); + Event event(executor); + event.Init(); + Stream stream(executor); + stream.Init(); + ASSERT_FALSE(record_called); + stream.ThenRecordEvent(&event); + ASSERT_TRUE(record_called); + ASSERT_FALSE(wait_called); + stream.ThenWaitFor(&event); + ASSERT_TRUE(wait_called); +} + +TEST_F(StreamExecutorTest, CreateTimer) { + static bool timer_created = false; + static bool timer_deleted = false; + se_.create_timer = [](const SP_Device* const device, SP_Timer* timer, + TF_Status* const status) -> void { + *timer = new SP_Timer_st(25); + timer_created = true; + }; + se_.destroy_timer = [](const SP_Device* const device, + SP_Timer timer) -> void { + auto custom_timer = static_cast(timer); + EXPECT_EQ(custom_timer->timer_id, 25); + delete custom_timer; + timer_deleted = true; + }; + + StreamExecutor* executor = GetExecutor(0); + ASSERT_FALSE(timer_created); + Stream stream(executor); + stream.Init(); + Timer* timer = new Timer(executor); + stream.InitTimer(timer); + ASSERT_TRUE(stream.ok()); + ASSERT_TRUE(timer_created); + ASSERT_FALSE(timer_deleted); + delete timer; + ASSERT_TRUE(timer_deleted); +} + +TEST_F(StreamExecutorTest, StartTimer) { + static bool start_called = false; + static bool stop_called = false; + static TF_Code start_timer_status = 
TF_OK; + static TF_Code stop_timer_status = TF_OK; + se_.create_timer = [](const SP_Device* const device, SP_Timer* timer, + TF_Status* const status) -> void { + *timer = new SP_Timer_st(7); + }; + se_.destroy_timer = [](const SP_Device* const device, + SP_Timer timer) -> void { delete timer; }; + se_.start_timer = [](const SP_Device* const device, SP_Stream stream, + SP_Timer timer, TF_Status* const status) { + TF_SetStatus(status, start_timer_status, ""); + EXPECT_EQ(timer->timer_id, 7); + start_called = true; + }; + se_.stop_timer = [](const SP_Device* const device, SP_Stream stream, + SP_Timer timer, TF_Status* const status) { + TF_SetStatus(status, stop_timer_status, ""); + EXPECT_EQ(timer->timer_id, 7); + stop_called = true; + }; + StreamExecutor* executor = GetExecutor(0); + Stream stream(executor); + stream.Init(); + Timer timer(executor); + stream.InitTimer(&timer); + + // Check both start and stop succeed + ASSERT_FALSE(start_called); + stream.ThenStartTimer(&timer); + ASSERT_TRUE(start_called); + ASSERT_FALSE(stop_called); + stream.ThenStopTimer(&timer); + ASSERT_TRUE(stop_called); + + // Check start timer fails + ASSERT_TRUE(stream.ok()); + start_timer_status = TF_UNKNOWN; + stream.ThenStartTimer(&timer); + ASSERT_FALSE(stream.ok()); + + // Check stop timer fails + start_timer_status = TF_OK; + stop_timer_status = TF_UNKNOWN; + Stream stream2(executor); + stream2.Init(); + Timer timer2(executor); + stream2.InitTimer(&timer2); + stream2.ThenStartTimer(&timer2); + ASSERT_TRUE(stream2.ok()); + stream2.ThenStopTimer(&timer2); + ASSERT_FALSE(stream2.ok()); +} + +TEST_F(StreamExecutorTest, TimerFns) { + se_.create_timer = [](const SP_Device* const device, SP_Timer* timer, + TF_Status* const status) -> void { + *timer = new SP_Timer_st(25000); + }; + se_.destroy_timer = [](const SP_Device* const device, + SP_Timer timer) -> void { delete timer; }; + + StreamExecutor* executor = GetExecutor(0); + Stream stream(executor); + stream.Init(); + Timer timer(executor); + stream.InitTimer(&timer); + // Our test nanoseconds function just returns value + // passed to SP_Timer_st constructor. 
+ ASSERT_EQ(timer.Nanoseconds(), 25000); + ASSERT_EQ(timer.Microseconds(), 25); +} + +TEST_F(StreamExecutorTest, MemcpyToHost) { + se_.create_stream = [](const SP_Device* const device, SP_Stream* stream, + TF_Status* const status) -> void { + *stream = new SP_Stream_st(14); + }; + se_.destroy_stream = [](const SP_Device* const device, + SP_Stream stream) -> void { delete stream; }; + + se_.memcpy_dtoh = [](const SP_Device* const device, SP_Stream stream, + void* host_dst, + const SP_DeviceMemoryBase* const device_src, + uint64_t size, TF_Status* const status) { + TF_SetStatus(status, TF_OK, ""); + EXPECT_EQ(stream->stream_id, 14); + std::memcpy(host_dst, device_src->opaque, size); + }; + + StreamExecutor* executor = GetExecutor(0); + Stream stream(executor); + stream.Init(); + size_t size = sizeof(int); + int src_data = 34; + int dst_data = 2; + DeviceMemoryBase device_src(&src_data, size); + Stream& stream_ref = stream.ThenMemcpy(&dst_data, device_src, size); + ASSERT_EQ(dst_data, 34); + ASSERT_EQ(stream_ref.implementation(), stream.implementation()); +} + +TEST_F(StreamExecutorTest, MemcpyFromHost) { + se_.memcpy_htod = [](const SP_Device* const device, SP_Stream stream, + SP_DeviceMemoryBase* const device_dst, + const void* host_src, uint64_t size, + TF_Status* const status) { + TF_SetStatus(status, TF_OK, ""); + std::memcpy(device_dst->opaque, host_src, size); + }; + + StreamExecutor* executor = GetExecutor(0); + Stream stream(executor); + stream.Init(); + size_t size = sizeof(int); + int src_data = 18; + int dst_data = 0; + DeviceMemoryBase device_dst(&dst_data, size); + stream.ThenMemcpy(&device_dst, &src_data, size); + ASSERT_EQ(dst_data, 18); +} + +TEST_F(StreamExecutorTest, MemcpyDeviceToDevice) { + se_.memcpy_dtod = [](const SP_Device* const device, SP_Stream stream, + SP_DeviceMemoryBase* const device_dst, + const SP_DeviceMemoryBase* const device_src, + uint64_t size, TF_Status* const status) { + TF_SetStatus(status, TF_OK, ""); + std::memcpy(device_dst->opaque, device_src->opaque, size); + }; + + StreamExecutor* executor = GetExecutor(0); + Stream stream(executor); + stream.Init(); + size_t size = sizeof(int); + int src_data = 18; + int dst_data = 0; + DeviceMemoryBase device_dst(&dst_data, size); + DeviceMemoryBase device_src(&src_data, size); + stream.ThenMemcpy(&device_dst, device_src, size); + ASSERT_EQ(dst_data, 18); +} + +TEST_F(StreamExecutorTest, SyncMemcpyToHost) { + se_.sync_memcpy_dtoh = [](const SP_Device* const device, void* host_dst, + const SP_DeviceMemoryBase* const device_src, + uint64_t size, TF_Status* const status) { + TF_SetStatus(status, TF_OK, ""); + std::memcpy(host_dst, device_src->opaque, size); + }; + + StreamExecutor* executor = GetExecutor(0); + size_t size = sizeof(int); + int src_data = 34; + int dst_data = 2; + DeviceMemoryBase device_src(&src_data, size); + TF_ASSERT_OK(executor->SynchronousMemcpyD2H(device_src, size, &dst_data)); + ASSERT_EQ(dst_data, 34); +} + +TEST_F(StreamExecutorTest, SyncMemcpyFromHost) { + se_.sync_memcpy_htod = + [](const SP_Device* const device, SP_DeviceMemoryBase* const device_dst, + const void* host_src, uint64_t size, TF_Status* const status) { + TF_SetStatus(status, TF_OK, ""); + std::memcpy(device_dst->opaque, host_src, size); + }; + + StreamExecutor* executor = GetExecutor(0); + size_t size = sizeof(int); + int src_data = 18; + int dst_data = 0; + DeviceMemoryBase device_dst(&dst_data, size); + TF_ASSERT_OK(executor->SynchronousMemcpyH2D(&src_data, size, &device_dst)); + ASSERT_EQ(dst_data, 18); +} + 
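// For orientation: given the SP_Platform / SP_StreamExecutor structs and the
// SE_InitPlugin entry point declared earlier in this change, a minimal device
// plugin might register itself roughly as sketched below. This is only an
// illustrative sketch: the "MyVendorDevice" name and the My* helpers are
// hypothetical placeholders, and a real plugin would also populate the
// allocation, stream, event, timer, and memcpy callbacks of SP_StreamExecutor
// (compare PopulateDefaultStreamExecutor() in the tests above).

#include "tensorflow/c/experimental/stream_executor/stream_executor.h"

namespace {

void MyCreateDevice(SE_CreateDeviceParams* params, TF_Status* status) {
  // A real plugin would acquire its hardware device here.
  params->device->struct_size = SP_DEVICE_STRUCT_SIZE;
  TF_SetStatus(status, TF_OK, "");
}
void MyDestroyDevice(SP_Device* device) {}

void MyCreateStreamExecutor(SE_CreateStreamExecutorParams* params,
                            TF_Status* status) {
  params->stream_executor->struct_size = SP_STREAMEXECUTOR_STRUCT_SIZE;
  // Assign allocate/deallocate, stream, event, timer, and memcpy callbacks
  // here, mirroring PopulateDefaultStreamExecutor() in the tests above.
  TF_SetStatus(status, TF_OK, "");
}
void MyDestroyStreamExecutor(SP_StreamExecutor* stream_executor) {}

void MyCreateTimerFns(SP_TimerFns* timer_fns, TF_Status* status) {
  TF_SetStatus(status, TF_OK, "");
}
void MyDestroyTimerFns(SP_TimerFns* timer_fns) {}
void MyDestroyPlatform(SP_Platform* platform) {}

}  // namespace

// Exported entry point that TensorFlow resolves after loading the plugin DSO
// (see RegisterDevicePlugin(dso_path) in stream_executor_internal.h).
void SE_InitPlugin(SE_PlatformRegistrationParams* params, TF_Status* status) {
  params->platform->struct_size = SP_PLATFORM_STRUCT_SIZE;
  params->platform->name = "MyVendorDevice";  // placeholder platform name
  params->platform->type = "GPU";             // device type reported to TF
  params->platform->visible_device_count = 1;
  params->platform->create_device = MyCreateDevice;
  params->platform->destroy_device = MyDestroyDevice;
  params->platform->create_stream_executor = MyCreateStreamExecutor;
  params->platform->destroy_stream_executor = MyDestroyStreamExecutor;
  params->platform->create_timer_fns = MyCreateTimerFns;
  params->platform->destroy_timer_fns = MyDestroyTimerFns;
  params->destroy_platform = MyDestroyPlatform;
  TF_SetStatus(status, TF_OK, "");
}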
+TEST_F(StreamExecutorTest, SyncMemcpyDeviceToDevice) { + se_.sync_memcpy_dtod = [](const SP_Device* const device, + SP_DeviceMemoryBase* const device_dst, + const SP_DeviceMemoryBase* const device_src, + uint64_t size, TF_Status* const status) { + TF_SetStatus(status, TF_OK, ""); + std::memcpy(device_dst->opaque, device_src->opaque, size); + }; + + StreamExecutor* executor = GetExecutor(0); + size_t size = sizeof(int); + int src_data = 18; + int dst_data = 0; + DeviceMemoryBase device_dst(&dst_data, size); + DeviceMemoryBase device_src(&src_data, size); + ASSERT_TRUE(executor->SynchronousMemcpy(&device_dst, device_src, size)); + ASSERT_EQ(dst_data, 18); +} + +TEST_F(StreamExecutorTest, BlockHostForEvent) { + static bool block_host_for_event_called = false; + se_.create_event = [](const SP_Device* const device, SP_Event* event, + TF_Status* const status) { + *event = new SP_Event_st(357); + }; + se_.destroy_event = [](const SP_Device* const device, SP_Event event) { + delete event; + }; + se_.block_host_for_event = [](const SP_Device* const device, SP_Event event, + TF_Status* const status) -> void { + ASSERT_EQ(event->event_id, 357); + TF_SetStatus(status, TF_OK, ""); + block_host_for_event_called = true; + }; + + StreamExecutor* executor = GetExecutor(0); + Stream stream(executor); + stream.Init(); + ASSERT_FALSE(block_host_for_event_called); + TF_ASSERT_OK(stream.BlockHostUntilDone()); + ASSERT_TRUE(block_host_for_event_called); +} + +TEST_F(StreamExecutorTest, SynchronizeAllActivity) { + static bool synchronize_all_called = false; + se_.synchronize_all_activity = [](const SP_Device* const device, + TF_Status* const status) { + TF_SetStatus(status, TF_OK, ""); + synchronize_all_called = true; + }; + + StreamExecutor* executor = GetExecutor(0); + ASSERT_FALSE(synchronize_all_called); + ASSERT_TRUE(executor->SynchronizeAllActivity()); + ASSERT_TRUE(synchronize_all_called); +} + +TEST_F(StreamExecutorTest, HostCallbackOk) { + se_.host_callback = [](SP_Device* const device, SP_Stream stream, + SE_StatusCallbackFn const callback_fn, + void* const callback_arg) -> TF_Bool { + TF_Status* status = TF_NewStatus(); + callback_fn(callback_arg, status); + bool ok = TF_GetCode(status) == TF_OK; + TF_DeleteStatus(status); + return ok; + }; + StreamExecutor* executor = GetExecutor(0); + Stream stream(executor); + stream.Init(); + std::function callback = []() -> port::Status { + return port::Status::OK(); + }; + stream.ThenDoHostCallbackWithStatus(callback); + ASSERT_TRUE(stream.ok()); +} + +TEST_F(StreamExecutorTest, HostCallbackError) { + se_.host_callback = [](SP_Device* const device, SP_Stream stream, + SE_StatusCallbackFn const callback_fn, + void* const callback_arg) -> TF_Bool { + TF_Status* status = TF_NewStatus(); + callback_fn(callback_arg, status); + bool ok = TF_GetCode(status) == TF_OK; + TF_DeleteStatus(status); + return ok; + }; + StreamExecutor* executor = GetExecutor(0); + Stream stream(executor); + stream.Init(); + std::function callback = []() -> port::Status { + return port::UnimplementedError("Unimplemented"); + }; + stream.ThenDoHostCallbackWithStatus(callback); + ASSERT_FALSE(stream.ok()); +} +} // namespace +} // namespace stream_executor diff --git a/tensorflow/c/kernels.cc b/tensorflow/c/kernels.cc index 0b12b17c09b..ed501b5b101 100644 --- a/tensorflow/c/kernels.cc +++ b/tensorflow/c/kernels.cc @@ -280,6 +280,36 @@ TF_Tensor* TF_AllocateOutput(TF_OpKernelContext* context, int index, return tf_tensor; } +TF_Tensor* TF_ForwardInputOrAllocateOutput( + TF_OpKernelContext* 
context, int* candidate_input_indices,
+    int num_candidate_input_indices, int output_index, int64_t* output_dims,
+    int output_num_dims, int* forwarded_input, TF_Status* status) {
+  TF_SetStatus(status, TF_OK, "");
+  auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(context);
+
+  static_assert(sizeof(int64_t) == sizeof(tensorflow::int64),
+                "64-bit int types should match in size");
+  tensorflow::gtl::ArraySlice<int> input_indices_array(
+      candidate_input_indices, num_candidate_input_indices);
+  tensorflow::gtl::ArraySlice<tensorflow::int64> output_dimarray(
+      reinterpret_cast<tensorflow::int64*>(output_dims), output_num_dims);
+  tensorflow::Tensor* output_tensor_pointer;
+  tensorflow::Status s = cc_ctx->forward_input_or_allocate_output(
+      input_indices_array, output_index,
+      tensorflow::TensorShape(output_dimarray), &output_tensor_pointer,
+      forwarded_input);
+  if (!s.ok()) {
+    ::tensorflow::Set_TF_Status_from_Status(status, s);
+    return nullptr;
+  }
+  TF_Tensor* tf_tensor_output = TF_TensorFromTensor(*output_tensor_pointer, &s);
+  if (!s.ok()) {
+    ::tensorflow::Set_TF_Status_from_Status(status, s);
+    return nullptr;
+  }
+  return tf_tensor_output;
+}
+
 TF_Tensor* TF_AllocateTemp(TF_OpKernelContext* context, TF_DataType dtype,
                            int64_t* dims, int num_dims,
                            TF_AllocatorAttributes* attributes,
diff --git a/tensorflow/c/kernels.h b/tensorflow/c/kernels.h
index 15fcf0f5188..489aa5399a5 100644
--- a/tensorflow/c/kernels.h
+++ b/tensorflow/c/kernels.h
@@ -200,6 +200,17 @@ TF_CAPI_EXPORT TF_Tensor* TF_AllocateOutput(TF_OpKernelContext* context,
                                             int64_t* dims, int num_dims,
                                             size_t len, TF_Status* status);
 
+// Tries to forward one of the inputs given in input_indices to
+// output[output_index]. If none of the given inputs can be forwarded, calls
+// allocate_output() to allocate a new output buffer. The index of the
+// forwarded input will be assigned to output argument forwarded_input (if it's
+// not nullptr). If no inputs are forwarded, forwarded_input will be assigned
+// -1.
+TF_CAPI_EXPORT TF_Tensor* TF_ForwardInputOrAllocateOutput(
+    TF_OpKernelContext* context, int* candidate_input_indices,
+    int num_candidate_input_indices, int output_index, int64_t* output_dims,
+    int output_num_dims, int* forwarded_input, TF_Status* status);
+
 // Allocates a temporary Tensor of the specified type and shape. The
 // Tensor must not be used after kernel construction is
 // complete.
diff --git a/tensorflow/c/kernels/histogram_summary_op.cc b/tensorflow/c/kernels/histogram_summary_op.cc
index ada1bd3c630..5de52703f5d 100644
--- a/tensorflow/c/kernels/histogram_summary_op.cc
+++ b/tensorflow/c/kernels/histogram_summary_op.cc
@@ -20,8 +20,8 @@ limitations under the License.
 #include "tensorflow/core/framework/selective_registration.h"
 #include "tensorflow/core/framework/summary.pb.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/lib/bfloat16/bfloat16.h"
 #include "tensorflow/core/lib/histogram/histogram.h"
+#include "tensorflow/core/platform/bfloat16.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/protobuf.h"
diff --git a/tensorflow/c/kernels/summary_op.cc b/tensorflow/c/kernels/summary_op.cc
index bd528da4165..ac7eced0ae7 100644
--- a/tensorflow/c/kernels/summary_op.cc
+++ b/tensorflow/c/kernels/summary_op.cc
@@ -25,7 +25,7 @@ limitations under the License.
#include "tensorflow/core/framework/selective_registration.h" #include "tensorflow/core/framework/summary.pb.h" #include "tensorflow/core/framework/types.h" -#include "tensorflow/core/lib/bfloat16/bfloat16.h" +#include "tensorflow/core/platform/bfloat16.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/protobuf.h" diff --git a/tensorflow/c/kernels_test.cc b/tensorflow/c/kernels_test.cc index e8223e40064..c9df2cc34d1 100644 --- a/tensorflow/c/kernels_test.cc +++ b/tensorflow/c/kernels_test.cc @@ -565,6 +565,74 @@ TEST_F(DeviceKernelOpTest, TestAllocateTempSize2x3) { output->DebugString(100)); } +TEST_F(DeviceKernelOpTest, TestForwardInputOrAllocateOutput) { + const char* node_name = "TestForwardInputOrAllocateOutputKernel"; + const char* op_name = "BazOp"; + const char* device_name = "FakeDeviceName"; + + REGISTER_OP(op_name) + .Input("input1: float") + .Input("input2: float") + .Output("output1: float") + .Attr("SomeDataTypeAttr: type"); + + // A kernel whose Compute function that forwards a scalar input to output + auto my_compute_func = [](void* kernel, TF_OpKernelContext* ctx) { + TF_Status* s = TF_NewStatus(); + int candidate_input_indices[1] = {0}; + int forwarded_input; + int64_t output_dims[1] = {}; + TF_Tensor* output = TF_ForwardInputOrAllocateOutput( + /*context=*/ctx, candidate_input_indices, + /*num_candidate_input_indices=*/1, + /*output_index=*/0, output_dims, /*output_num_dims=*/0, + &forwarded_input, /*status=*/s); + EXPECT_EQ(TF_OK, TF_GetCode(s)); + EXPECT_EQ(forwarded_input, 0); + EXPECT_EQ(TF_FLOAT, TF_TensorType(output)); + EXPECT_EQ(0, TF_NumDims(output)); + TF_DeleteStatus(s); + TF_DeleteTensor(output); + }; + + TF_KernelBuilder* builder = TF_NewKernelBuilder(op_name, device_name, nullptr, + my_compute_func, nullptr); + + { + TF_Status* status = TF_NewStatus(); + TF_RegisterKernelBuilder(node_name, builder, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)); + TF_DeleteStatus(status); + } + + { + OpKernelContext::Params p; + DummyDevice dummy_device(nullptr); + p.device = &dummy_device; + AllocatorAttributes alloc_attrs; + p.output_attr_array = &alloc_attrs; + + Tensor t(123.0f); + + gtl::InlinedVector inputs; + // GetFakeKernel requires a NodeDef with two inputs + inputs.emplace_back(&t); + inputs.emplace_back(); + p.inputs = &inputs; + + Status status; + std::unique_ptr kernel = + GetFakeKernel(device_name, op_name, node_name, &status); + TF_EXPECT_OK(status); + ASSERT_NE(nullptr, kernel.get()); + + p.op_kernel = kernel.get(); + OpKernelContext ctx(&p); + kernel->Compute(&ctx); + ASSERT_EQ(123, ctx.mutable_output(0)->scalar()()); + } +} + void validate_tensor(TF_Tensor* tensor, int64_t* dims, int64_t num_dims, TF_DataType dtype) { EXPECT_EQ(TF_FLOAT, TF_TensorType(tensor)); diff --git a/tensorflow/c/logging.cc b/tensorflow/c/logging.cc index bf6bf069fff..13c9e6ac208 100644 --- a/tensorflow/c/logging.cc +++ b/tensorflow/c/logging.cc @@ -28,6 +28,7 @@ void TF_Log(TF_LogLevel level, const char* fmt, ...) { va_list args; va_start(args, fmt); auto message = BuildMessage(fmt, args); + va_end(args); switch (level) { case TF_INFO: LOG(INFO) << message; @@ -48,6 +49,7 @@ void TF_VLog(int level, const char* fmt, ...) { va_list args; va_start(args, fmt); auto message = BuildMessage(fmt, args); + va_end(args); VLOG(level) << message; } @@ -55,5 +57,6 @@ void TF_DVLog(int level, const char* fmt, ...) 
{ va_list args; va_start(args, fmt); auto message = BuildMessage(fmt, args); + va_end(args); DVLOG(level) << message; } diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD index a67d349bab7..a3ea0c75bc7 100644 --- a/tensorflow/cc/saved_model/BUILD +++ b/tensorflow/cc/saved_model/BUILD @@ -47,6 +47,7 @@ cc_library( # TODO(b/111634734): :lib and :protos_all contain dependencies that # cannot be built on mobile platforms. Instead, include the appropriate # tf_lib depending on the build platform. + "@com_google_absl//absl/memory:memory", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", ]), @@ -171,6 +172,7 @@ tf_cc_test( deps = [ ":constants", ":loader", + ":reader", ":signature_constants", ":tag_constants", "//tensorflow/core:lib", diff --git a/tensorflow/cc/saved_model/experimental/public/BUILD b/tensorflow/cc/saved_model/experimental/public/BUILD index 3e9a671a61f..9640848ebf5 100644 --- a/tensorflow/cc/saved_model/experimental/public/BUILD +++ b/tensorflow/cc/saved_model/experimental/public/BUILD @@ -51,8 +51,32 @@ cc_library( deps = [ ":concrete_function", ":concrete_function_list", + ":signature_def_function", "//tensorflow/c/experimental/saved_model/public:saved_model_api", "//tensorflow/cc/experimental/base/public:runtime", "//tensorflow/cc/experimental/base/public:status", ], ) + +cc_library( + name = "signature_def_function", + hdrs = [ + "signature_def_function.h", + ], + deps = [ + ":signature_def_function_metadata", + "//tensorflow/c/eager:c_api", + "//tensorflow/c/experimental/saved_model/public:signature_def_function", + "//tensorflow/cc/experimental/base/public:status", + ], +) + +cc_library( + name = "signature_def_function_metadata", + hdrs = [ + "signature_def_function_metadata.h", + ], + deps = [ + "//tensorflow/c/experimental/saved_model/public:signature_def_function_metadata", + ], +) diff --git a/tensorflow/cc/saved_model/experimental/public/saved_model_api.h b/tensorflow/cc/saved_model/experimental/public/saved_model_api.h index 04018bf2aab..c2bfb4dcf83 100644 --- a/tensorflow/cc/saved_model/experimental/public/saved_model_api.h +++ b/tensorflow/cc/saved_model/experimental/public/saved_model_api.h @@ -26,6 +26,7 @@ limitations under the License. #include "tensorflow/cc/experimental/base/public/status.h" #include "tensorflow/cc/saved_model/experimental/public/concrete_function.h" #include "tensorflow/cc/saved_model/experimental/public/concrete_function_list.h" +#include "tensorflow/cc/saved_model/experimental/public/signature_def_function.h" namespace tensorflow { namespace experimental { @@ -80,8 +81,8 @@ class SavedModelAPI { // If status is not OK, returns nullptr. Otherwise, returns a // tensorflow::cc::ConcreteFunction pointer. The lifetime of this pointer // is bound to SavedModelAPI it was loaded from. - ConcreteFunction* GetSignatureDefFunction(const std::string& function_path, - Status* status); + SignatureDefFunction* GetSignatureDefFunction( + const std::string& function_path, Status* status); // Lists all Conrete Functions available from the SavedModel. 
std::vector ListFunctions(); @@ -140,14 +141,14 @@ inline ConcreteFunction* SavedModelAPI::GetConcreteFunction( return ConcreteFunction::wrap(function); } -inline ConcreteFunction* SavedModelAPI::GetSignatureDefFunction( +inline SignatureDefFunction* SavedModelAPI::GetSignatureDefFunction( const std::string& function_path, Status* status) { - TF_ConcreteFunction* function = TF_GetSavedModelSignatureDefFunction( + TF_SignatureDefFunction* function = TF_GetSavedModelSignatureDefFunction( saved_model_.get(), function_path.c_str(), status->GetTFStatus()); if (!status->ok()) { return nullptr; } - return ConcreteFunction::wrap(function); + return SignatureDefFunction::wrap(function); } inline std::vector SavedModelAPI::ListFunctions() { diff --git a/tensorflow/cc/saved_model/experimental/public/signature_def_function.h b/tensorflow/cc/saved_model/experimental/public/signature_def_function.h new file mode 100644 index 00000000000..bc72d208e87 --- /dev/null +++ b/tensorflow/cc/saved_model/experimental/public/signature_def_function.h @@ -0,0 +1,89 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_SIGNATURE_DEF_FUNCTION_H_ +#define TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_SIGNATURE_DEF_FUNCTION_H_ + +#include + +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/experimental/saved_model/public/signature_def_function.h" +#include "tensorflow/cc/experimental/base/public/status.h" +#include "tensorflow/cc/saved_model/experimental/public/signature_def_function_metadata.h" + +namespace tensorflow { +namespace experimental { +namespace cc { + +// SignatureDefFunctions are functions that correspond to either: +// "signatures" saved from a TF2 SavedModel APIs: +// https://github.com/tensorflow/tensorflow/blob/8ce0600f58ed84a8c84a7bbdb014d1f09e44f4c8/tensorflow/python/saved_model/save.py#L830-L854 +// Or the "SignatureDefMap" saved from TF1 SavedModel APIs: +// https://github.com/tensorflow/tensorflow/blob/8ce0600f58ed84a8c84a7bbdb014d1f09e44f4c8/tensorflow/python/saved_model/load_v1_in_v2_test.py#L170-L174 +// In both cases, a SignatureDef is serialized as a SignatureDef protobuf: +// https://github.com/tensorflow/tensorflow/blob/8ce0600f58ed84a8c84a7bbdb014d1f09e44f4c8/tensorflow/core/protobuf/meta_graph.proto#L260-L330 +// and represents a computation defined by a TF subgraph. +// These Signatures were primarily designed to be interoperable with the legacy +// TF 1 Session-based C++ SavedModelBundle loading APIs: +// https://github.com/tensorflow/tensorflow/blob/26c4ee0c833e74f94d0102d8b005c41a28b44445/tensorflow/cc/saved_model/loader.h#L96-L108 +// SignatureDefFunctions have different semantics from regular TF2 +// ConcreteFunctions, and are mainly intended provide a serving-friendly +// transition point from the TF1 Session API. +// First, SignatureDefFunctions have different calling conventions. 
+// SignatureDefFunctions' inputs and outputs are constrained to **flattened +// lists of TensorHandles only**. They do not support more exotic input/output +// types (like optionals, generators, etc). Additionally, this flattening means +// they will not preserve the exact interface of the original tf.function they +// were traced from, as things like composite tensors decay into their +// internal dense tensor representation. +// Second, all inputs and outputs are "named", and these names are load bearing +// (eg: they are part of the interface of tensorflow_serving): +// https://github.com/tensorflow/serving/blob/e0d247b2e4050713194b8fad0be24a0636df7209/tensorflow_serving/apis/predict.proto#L21 +// https://github.com/tensorflow/serving/blob/e0d247b2e4050713194b8fad0be24a0636df7209/tensorflow_serving/apis/predict.proto#L39 +// The name of each input/output is stored in the corresponding tf::Argument in +// SignatureDefFunctionMetadata::arguments(). Users must ensure the order of +// TensorHandles passed to the function matches with the order of named +// arguments. Similarly the name of the outputs is stored in +// SignatureDefFunctionMetadata::returns(). +class SignatureDefFunction final { + public: + // Returns FunctionMetadata associated with this ConcreteFunction. + const SignatureDefFunctionMetadata* GetFunctionMetadata(); + + private: + friend class SavedModelAPI; + friend class ConcreteFunctionList; + + // TODO(bmzhao): Consider adding a macro for wrapping/unwrapping + // when moving out of experimental. + static SignatureDefFunction* wrap(TF_SignatureDefFunction* p) { + return reinterpret_cast(p); + } + static TF_SignatureDefFunction* unwrap(SignatureDefFunction* p) { + return reinterpret_cast(p); + } +}; + +inline const SignatureDefFunctionMetadata* +SignatureDefFunction::GetFunctionMetadata() { + return SignatureDefFunctionMetadata::wrap( + TF_SignatureDefFunctionGetMetadata(unwrap(this))); +} + +} // namespace cc +} // namespace experimental +} // namespace tensorflow + +#endif // TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_SIGNATURE_DEF_FUNCTION_H_ diff --git a/tensorflow/cc/saved_model/experimental/public/signature_def_function_metadata.h b/tensorflow/cc/saved_model/experimental/public/signature_def_function_metadata.h new file mode 100644 index 00000000000..6cb01bf1a26 --- /dev/null +++ b/tensorflow/cc/saved_model/experimental/public/signature_def_function_metadata.h @@ -0,0 +1,47 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_SIGNATURE_DEF_FUNCTION_METADATA_H_ +#define TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_SIGNATURE_DEF_FUNCTION_METADATA_H_ + +#include + +#include "tensorflow/c/experimental/saved_model/public/signature_def_function_metadata.h" + +namespace tensorflow { +namespace experimental { +namespace cc { + +// SignatureDefFunctionMetadata stores additional information on each input +// and output's names, dtypes, and shape. +class SignatureDefFunctionMetadata final { + // TODO(bmzhao): Add getters here as necessary. + private: + friend class SignatureDefFunction; + static SignatureDefFunctionMetadata* wrap( + TF_SignatureDefFunctionMetadata* p) { + return reinterpret_cast(p); + } + static TF_SignatureDefFunctionMetadata* unwrap( + SignatureDefFunctionMetadata* p) { + return reinterpret_cast(p); + } +}; + +} // namespace cc +} // namespace experimental +} // namespace tensorflow + +#endif // TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_SIGNATURE_DEF_FUNCTION_METADATA_H_ diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc index f9c720a2ba2..ecefe7d0406 100644 --- a/tensorflow/cc/saved_model/loader.cc +++ b/tensorflow/cc/saved_model/loader.cc @@ -31,6 +31,7 @@ limitations under the License. #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/protobuf/graph_debug_info.pb.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" #include "tensorflow/core/protobuf/saver.pb.h" #include "tensorflow/core/public/session.h" #include "tensorflow/core/public/session_options.h" @@ -95,16 +96,6 @@ static Status ValidateSavedTensors(const GraphDef& graph_def) { return Status::OK(); } -Status LoadMetaGraphIntoSession(const MetaGraphDef& meta_graph_def, - const SessionOptions& session_options, - std::unique_ptr* session) { - Session* session_p = nullptr; - TF_RETURN_IF_ERROR(NewSession(session_options, &session_p)); - session->reset(session_p); - TF_RETURN_IF_ERROR(ValidateSavedTensors(meta_graph_def.graph_def())); - return (*session)->Create(meta_graph_def.graph_def()); -} - Tensor CreateStringTensor(const string& value) { Tensor tensor(DT_STRING, TensorShape({})); tensor.scalar()() = value; @@ -228,22 +219,18 @@ Status RunRestore(const RunOptions& run_options, const string& export_dir, nullptr /* outputs */, &run_metadata, session); } -Status ReadSavedModelDebugInfoIfPresent( - const string& export_dir, - std::unique_ptr* debug_info_proto) { - LOG(INFO) << "Reading SavedModel debug info (if present) from: " - << export_dir; +} // namespace - const string debug_info_pb_path = - io::JoinPath(export_dir, "debug", "saved_model_debug_info.pb"); - if (Env::Default()->FileExists(debug_info_pb_path).ok()) { - GraphDebugInfo debug_info; - TF_RETURN_IF_ERROR( - ReadBinaryProto(Env::Default(), debug_info_pb_path, &debug_info)); - *debug_info_proto = - absl::make_unique(std::move(debug_info)); - } - return Status::OK(); +SavedModelBundleInterface::~SavedModelBundleInterface() {} + +Status LoadMetagraphIntoSession(const SessionOptions& session_options, + const MetaGraphDef& meta_graph, + std::unique_ptr* session) { + Session* session_p = nullptr; + TF_RETURN_IF_ERROR(NewSession(session_options, &session_p)); + session->reset(session_p); + TF_RETURN_IF_ERROR(ValidateSavedTensors(meta_graph.graph_def())); + return (*session)->Create(meta_graph.graph_def()); } Status LoadSavedModelInternal(const 
SessionOptions& session_options, @@ -251,46 +238,17 @@ Status LoadSavedModelInternal(const SessionOptions& session_options, const string& export_dir, const std::unordered_set& tags, SavedModelBundle* const bundle) { - const uint64 read_start_microseconds = Env::Default()->NowMicros(); TF_RETURN_IF_ERROR(ReadMetaGraphDefFromSavedModel(export_dir, tags, &bundle->meta_graph_def)); TF_RETURN_IF_ERROR( ReadSavedModelDebugInfoIfPresent(export_dir, &bundle->debug_info)); - TF_RETURN_IF_ERROR(LoadMetaGraphIntoSession( - bundle->meta_graph_def, session_options, &bundle->session)); - - std::vector asset_file_defs; - TF_RETURN_IF_ERROR( - internal::GetAssetFileDefs(bundle->meta_graph_def, &asset_file_defs)); - TF_RETURN_IF_ERROR( - RunRestore(run_options, export_dir, - bundle->meta_graph_def.saver_def().restore_op_name(), - bundle->meta_graph_def.saver_def().filename_tensor_name(), - asset_file_defs, bundle->session.get())); - // Record walltime spent in restoring graph from disk, but postpone metric - // increments until graph init finishes. - const uint64 restore_graph_walltime = - GetLatencyMicroseconds(read_start_microseconds); - - const uint64 graph_init_start_microseconds = Env::Default()->NowMicros(); - string init_op_name; - TF_RETURN_IF_ERROR( - internal::GetInitOp(export_dir, bundle->meta_graph_def, &init_op_name)); - TF_RETURN_IF_ERROR(RunInitOp(run_options, export_dir, bundle->meta_graph_def, - asset_file_defs, bundle->session.get(), - init_op_name)); - load_latency_by_stage->GetCell(export_dir, "restore_graph") - ->Add(restore_graph_walltime); - // Record wall time spent in init op. - load_latency_by_stage->GetCell(export_dir, "init_graph") - ->Add(GetLatencyMicroseconds(graph_init_start_microseconds)); + TF_RETURN_IF_ERROR(LoadMetagraphIntoSession( + session_options, bundle->meta_graph_def, &bundle->session)); + TF_RETURN_IF_ERROR(RestoreSession(run_options, bundle->meta_graph_def, + export_dir, &bundle->session)); return Status::OK(); } -} // namespace - -SavedModelBundleInterface::~SavedModelBundleInterface() {} - Status LoadSavedModel(const SessionOptions& session_options, const RunOptions& run_options, const string& export_dir, const std::unordered_set& tags, @@ -424,6 +382,35 @@ class LiteSessionWrapper : public Session { }; } // namespace +Status RestoreSession(const RunOptions& run_options, + const MetaGraphDef& meta_graph, const string& export_dir, + std::unique_ptr* session) { + const uint64 read_start_microseconds = Env::Default()->NowMicros(); + std::vector asset_file_defs; + TF_RETURN_IF_ERROR(internal::GetAssetFileDefs(meta_graph, &asset_file_defs)); + TF_RETURN_IF_ERROR(RunRestore(run_options, export_dir, + meta_graph.saver_def().restore_op_name(), + meta_graph.saver_def().filename_tensor_name(), + asset_file_defs, session->get())); + // Record walltime spent in restoring graph from disk, but postpone metric + // increments until graph init finishes. + const uint64 restore_graph_walltime = + GetLatencyMicroseconds(read_start_microseconds); + + const uint64 graph_init_start_microseconds = Env::Default()->NowMicros(); + string init_op_name; + TF_RETURN_IF_ERROR( + internal::GetInitOp(export_dir, meta_graph, &init_op_name)); + TF_RETURN_IF_ERROR(RunInitOp(run_options, export_dir, meta_graph, + asset_file_defs, session->get(), init_op_name)); + load_latency_by_stage->GetCell(export_dir, "restore_graph") + ->Add(restore_graph_walltime); + // Record wall time spent in init op. 
+  load_latency_by_stage->GetCell(export_dir, "init_graph")
+      ->Add(GetLatencyMicroseconds(graph_init_start_microseconds));
+  return Status::OK();
+}
+
 Status LoadSavedModel(const SessionOptions& session_options,
                       const RunOptions& run_options, const string& export_dir,
                       const std::unordered_set<string>& tags,
diff --git a/tensorflow/cc/saved_model/loader.h b/tensorflow/cc/saved_model/loader.h
index 2b2e44bc619..5ef6070998e 100644
--- a/tensorflow/cc/saved_model/loader.h
+++ b/tensorflow/cc/saved_model/loader.h
@@ -96,6 +96,21 @@ class SavedModelBundleLite : public SavedModelBundleInterface {
   protobuf::Map<string, SignatureDef> signatures_;
 };
 
+// Restore variables and resources in the SavedModel export dir for the
+// indicated metagraph.
+// The recommended way to load a saved model is to call LoadSavedModel,
+// which provides an already initialized Metagraph, Session, and DebugInfo.
+Status RestoreSession(const RunOptions& run_options,
+                      const MetaGraphDef& meta_graph, const string& export_dir,
+                      std::unique_ptr<Session>* session);
+
+// Initialize a session which wraps this metagraph.
+// The recommended way to load a saved model is to call LoadSavedModel,
+// which provides an already initialized Metagraph, Session, and DebugInfo.
+Status LoadMetagraphIntoSession(const SessionOptions& session_options,
+                                const MetaGraphDef& meta_graph,
+                                std::unique_ptr<Session>* session);
+
 /// Loads a SavedModel from the specified export directory. The MetaGraphDef
 /// to be loaded is identified by the supplied tags, corresponding exactly to
 /// the set of tags used at SavedModel build time. Stores a SavedModel bundle in
diff --git a/tensorflow/cc/saved_model/reader.cc b/tensorflow/cc/saved_model/reader.cc
index d6d99229372..c1d4736f6b9 100644
--- a/tensorflow/cc/saved_model/reader.cc
+++ b/tensorflow/cc/saved_model/reader.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include
+#include "absl/memory/memory.h"
 #include "tensorflow/cc/saved_model/constants.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -86,4 +87,22 @@ Status ReadMetaGraphDefFromSavedModel(const string& export_dir,
   return Status::OK();
 }
 
+Status ReadSavedModelDebugInfoIfPresent(
+    const string& export_dir,
+    std::unique_ptr<GraphDebugInfo>* debug_info_proto) {
+  LOG(INFO) << "Reading SavedModel debug info (if present) from: "
+            << export_dir;
+
+  const string debug_info_pb_path =
+      io::JoinPath(export_dir, "debug", "saved_model_debug_info.pb");
+  if (Env::Default()->FileExists(debug_info_pb_path).ok()) {
+    GraphDebugInfo debug_info;
+    TF_RETURN_IF_ERROR(
+        ReadBinaryProto(Env::Default(), debug_info_pb_path, &debug_info));
+    *debug_info_proto =
+        absl::make_unique<GraphDebugInfo>(std::move(debug_info));
+  }
+  return Status::OK();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/cc/saved_model/reader.h b/tensorflow/cc/saved_model/reader.h
index 5815108df2a..602f6cb21c1 100644
--- a/tensorflow/cc/saved_model/reader.h
+++ b/tensorflow/cc/saved_model/reader.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/graph_debug_info.pb.h"
 #include "tensorflow/core/protobuf/meta_graph.pb.h"
 
 namespace tensorflow {
@@ -34,6 +35,11 @@ Status ReadMetaGraphDefFromSavedModel(const string& export_dir,
                                       const std::unordered_set<string>& tags,
                                       MetaGraphDef* const meta_graph_def);
 
+// Store debug info from the SavedModel export dir.
+Status ReadSavedModelDebugInfoIfPresent( + const string& export_dir, + std::unique_ptr* debug_info_proto); + } // namespace tensorflow #endif // TENSORFLOW_CC_SAVED_MODEL_READER_H_ diff --git a/tensorflow/cc/saved_model/reader_test.cc b/tensorflow/cc/saved_model/reader_test.cc index bc630bcaede..b5e8b67a123 100644 --- a/tensorflow/cc/saved_model/reader_test.cc +++ b/tensorflow/cc/saved_model/reader_test.cc @@ -106,5 +106,11 @@ TEST_F(ReaderTest, InvalidExportPath) { EXPECT_FALSE(st.ok()); } +TEST_F(ReaderTest, ReadSavedModelDebugInfoIfPresent) { + const string export_dir = GetDataDependencyFilepath(TestDataSharded()); + std::unique_ptr debug_info_proto; + TF_ASSERT_OK(ReadSavedModelDebugInfoIfPresent(export_dir, &debug_info_proto)); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/cc/saved_model/saved_model_bundle_test.cc b/tensorflow/cc/saved_model/saved_model_bundle_test.cc index d6c375c7448..31f676920aa 100644 --- a/tensorflow/cc/saved_model/saved_model_bundle_test.cc +++ b/tensorflow/cc/saved_model/saved_model_bundle_test.cc @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/cc/saved_model/loader.h" - #include "tensorflow/cc/saved_model/constants.h" +#include "tensorflow/cc/saved_model/loader.h" +#include "tensorflow/cc/saved_model/reader.h" #include "tensorflow/cc/saved_model/signature_constants.h" #include "tensorflow/cc/saved_model/tag_constants.h" #include "tensorflow/core/example/example.pb.h" @@ -26,6 +26,7 @@ limitations under the License. #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" namespace tensorflow { namespace { @@ -131,6 +132,43 @@ TEST_F(LoaderTest, TagMatch) { CheckSavedModelBundle(export_dir, bundle); } +TEST_F(LoaderTest, ReadMetaGraphFromSavedModel) { + SavedModelBundle bundle; + SessionOptions session_options; + RunOptions run_options; + + const string export_dir = + io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataSharded); + TF_ASSERT_OK(LoadSavedModel(session_options, run_options, export_dir, + {kSavedModelTagServe}, &bundle)); + MetaGraphDef actual_metagraph; + TF_ASSERT_OK(ReadMetaGraphDefFromSavedModel(export_dir, {kSavedModelTagServe}, + &actual_metagraph)); + EXPECT_EQ(actual_metagraph.DebugString(), + bundle.meta_graph_def.DebugString()); +} + +TEST_F(LoaderTest, RestoreSession) { + SavedModelBundle bundle; + SessionOptions session_options; + RunOptions run_options; + + const string export_dir = + io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataSharded); + TF_ASSERT_OK(LoadSavedModel(session_options, run_options, export_dir, + {kSavedModelTagServe}, &bundle)); + + SavedModelBundle actual_bundle; + const std::unordered_set tags = {kSavedModelTagServe}; + TF_ASSERT_OK(ReadMetaGraphDefFromSavedModel(export_dir, tags, + &actual_bundle.meta_graph_def)); + TF_ASSERT_OK(LoadMetagraphIntoSession( + session_options, actual_bundle.meta_graph_def, &actual_bundle.session)); + TF_ASSERT_OK(RestoreSession(run_options, actual_bundle.meta_graph_def, + export_dir, &actual_bundle.session)); + CheckSavedModelBundle(export_dir, actual_bundle); +} + TEST_F(LoaderTest, NoTagMatch) { SavedModelBundle bundle; RunOptions run_options; diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc 
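For reference, the three new entry points introduced above compose as follows; this is a minimal sketch that mirrors the LoaderTest.RestoreSession case, with illustrative function and variable names (only the ReadMetaGraphDefFromSavedModel, LoadMetagraphIntoSession, and RestoreSession signatures come from this change):

#include "tensorflow/cc/saved_model/loader.h"
#include "tensorflow/cc/saved_model/reader.h"
#include "tensorflow/cc/saved_model/tag_constants.h"

// Load a SavedModel in three explicit steps instead of calling LoadSavedModel.
tensorflow::Status LoadPiecewise(const std::string& export_dir,
                                 tensorflow::SavedModelBundle* bundle) {
  tensorflow::SessionOptions session_options;
  tensorflow::RunOptions run_options;
  // 1. Read only the MetaGraphDef matching the "serve" tag from disk.
  tensorflow::Status status = tensorflow::ReadMetaGraphDefFromSavedModel(
      export_dir, {tensorflow::kSavedModelTagServe}, &bundle->meta_graph_def);
  if (!status.ok()) return status;
  // 2. Create a session and import the graph into it.
  status = tensorflow::LoadMetagraphIntoSession(
      session_options, bundle->meta_graph_def, &bundle->session);
  if (!status.ok()) return status;
  // 3. Restore variables/assets and run the init op, recording load metrics.
  return tensorflow::RestoreSession(run_options, bundle->meta_graph_def,
                                    export_dir, &bundle->session);
}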
index b1525337dbc..971a5383f6b 100644 --- a/tensorflow/compiler/jit/xla_compilation_cache.cc +++ b/tensorflow/compiler/jit/xla_compilation_cache.cc @@ -278,16 +278,14 @@ Status XlaCompilationCache::CompileSingleOp( const NodeDef& node_def = ctx->op_kernel().def(); TF_ASSIGN_OR_RETURN(auto graph, CreateGraph(node_def, args, result_dtypes)); - bool are_args_supported = - absl::c_all_of(args, [](const XlaCompiler::Argument arg) { - return arg.kind == XlaCompiler::Argument::kConstant || - arg.kind == XlaCompiler::Argument::kParameter; + bool has_tensor_list_arg = + absl::c_any_of(args, [](const XlaCompiler::Argument arg) { + return arg.kind == XlaCompiler::Argument::kTensorList; }); const ConfigProto* config = ctx->function_library()->config_proto(); bool use_mlir = config && config->experimental().enable_mlir_bridge(); - // TODO(b/155596779): Understand the source of other argument types and - // depending on the source either support those or avoid these codepath. - if (!use_mlir || !are_args_supported) { + // TODO(b/155596779): Support TensorList args. + if (!use_mlir || !has_tensor_list_arg) { return compiler->CompileGraph(compile_options, node_def.name(), std::move(graph), args, result); } diff --git a/tensorflow/compiler/mlir/BUILD b/tensorflow/compiler/mlir/BUILD index 01c187790b7..d8b4fe5bcef 100644 --- a/tensorflow/compiler/mlir/BUILD +++ b/tensorflow/compiler/mlir/BUILD @@ -40,13 +40,16 @@ cc_library( srcs = ["tf_mlir_opt_main.cc"], deps = [ ":init_mlir", + "//tensorflow/compiler/mlir/lite:tensorflow_lite", + "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/core:lib", "//tensorflow/core/platform:logging", "@llvm-project//llvm:Support", - "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", "@llvm-project//mlir:IR", "@llvm-project//mlir:MlirOptLib", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Shape", "@llvm-project//mlir:Support", ], ) @@ -127,9 +130,7 @@ tf_cc_binary( deps = [ ":passes", ":tf_mlir_opt_main", - "//tensorflow/compiler/mlir/lite:tensorflow_lite_dialect_registration", "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_pass_registration", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_dialect_registration", "//tensorflow/compiler/mlir/tensorflow:tf_graph_optimization_pass", "//tensorflow/compiler/mlir/tfjs:tensorflow_js_dialect_registration", "//tensorflow/compiler/mlir/xla:all_xla_passes_for_testing", diff --git a/tensorflow/compiler/mlir/hlo/BUILD b/tensorflow/compiler/mlir/hlo/BUILD index 126d44670a0..7be39aef9da 100644 --- a/tensorflow/compiler/mlir/hlo/BUILD +++ b/tensorflow/compiler/mlir/hlo/BUILD @@ -813,7 +813,8 @@ cc_binary( ], deps = [ ":all_passes", - ":hlo_dialect_registration", + ":hlo", + ":lhlo", "@llvm-project//llvm:Support", "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", "@llvm-project//mlir:IR", diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h index ad044e1d322..4286c837a24 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h @@ -56,19 +56,9 @@ class MhloDialect : public Dialect { void printType(Type type, DialectAsmPrinter &os) const override; }; -namespace HLOTypes { -enum Kind { - Token = Type::FIRST_XLA_HLO_TYPE, -}; -} // namespace HLOTypes - class TokenType : public Type::TypeBase { public: using Base::Base; - - static TokenType get(MLIRContext 
*context) { - return Base::get(context, HLOTypes::Token); - } }; // Shape derivation function that computes the shape of the result based on diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.td index 3fa46584ca2..750cce65b62 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.td +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.td @@ -81,6 +81,8 @@ def LHLO_ConstOp : LHLO_Op<"constant", []>, BASE_HLO_ConstOp { ElementsAttr:$value, Arg:$output ); + + let hasCanonicalizer = 1; } def LHLO_IotaOp : LHLO_Op<"iota", []>, BASE_HLO_IotaOp { diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/hlo_utils.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/hlo_utils.h index 1e335ae6b82..74ea9c9b1a7 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/hlo_utils.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/hlo_utils.h @@ -65,9 +65,24 @@ static ElementsAttr getSplat(Builder* b, Value val, T constant) { // Returns DenseElementsAttr of rank zero with the given element type and the // value. -// Requires `ty` to be either FloatType of IntegerType. +// Requires `ty` to be either FloatType, IntegerType, or ComplexType. DenseElementsAttr GetScalarOfType(Type ty, int64_t raw_value); +// Enum type used to specify scalar argument to GetScalarLimitOfType. +enum ScalarLimit { + kLowest, // The scalar corresponding to numeric_limits::lowest. + kInfinityLowest, // Like kMax, but returns -infinity where available. + kMax, // The scalar corresponding to numeric_limits::max. + kInfinityMax, // Like kMax, but returns infinity where available. +}; + +// Returns a scalar limit value for the given type. +// +// The argument 'limit' describes which scalar value to return. +// +// Requires `ty` to be either FloatType or IntegerType. +DenseElementsAttr GetScalarLimitOfType(Type ty, ScalarLimit limit); + } // namespace hlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/lhlo_ops.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/lhlo_ops.cc index f61a66397e7..81407c89204 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/lhlo_ops.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/lhlo_ops.cc @@ -29,6 +29,7 @@ limitations under the License. #include "llvm/ADT/StringRef.h" #include "llvm/Support/FormatVariadic.h" #include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h.inc" +#include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" #include "mlir/IR/Dialect.h" @@ -56,6 +57,38 @@ LmhloDialect::LmhloDialect(MLIRContext *context) >(); } +//===----------------------------------------------------------------------===// +// ConstOp. +//===----------------------------------------------------------------------===// + +/// An lho.constant on an memref that is locally allocated and with no other +/// users (other than dealloc's) can be erased. +// TODO: This can be generalized to an arbitrary op by making use of memory +// effects (write memory effect). +struct EraseConstOp : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(ConstOp op, + PatternRewriter& rewriter) const override { + Value memref = op.output(); + if (!memref.getDefiningOp()) { + return failure(); + } + + // Check that all uses of the memref are either DeallocOps or this op. 
+ for (Operation* user : memref.getUsers()) + if (user != op && !isa(user)) return failure(); + + rewriter.eraseOp(op); + return success(); + } +}; + +void ConstOp::getCanonicalizationPatterns(OwningRewritePatternList& results, + MLIRContext* context) { + results.insert(context); +} + //===----------------------------------------------------------------------===// // StaticMemRefCastOp //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_linalg.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_linalg.cc index f47f2c2fbdc..033021c36ac 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_linalg.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_linalg.cc @@ -15,6 +15,8 @@ limitations under the License. // This file implements logic for lowering HLO/LHLO dialect to Linalg dialect. +#include + #include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" #include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" #include "mlir-hlo/Dialect/mhlo/transforms/map_lmhlo_to_scalar_op.h" @@ -598,6 +600,7 @@ class ReshapeOpConverter : public OpConversionPattern { unsigned currSrcDim = 0, currDstDim = 0; SmallVector reassociationMap( dstShape.size()); + bool isExpandingOrCollapsing = true; while (currSrcDim < srcShape.size() && currDstDim < dstShape.size()) { int64_t dstSize = dstShape[currDstDim]; int64_t srcSize = srcShape[currSrcDim]; @@ -619,11 +622,47 @@ class ReshapeOpConverter : public OpConversionPattern { } } } else { - return failure(); + isExpandingOrCollapsing = false; + break; } currDstDim++; } - if (currSrcDim != srcShape.size()) return failure(); + if (currSrcDim != srcShape.size()) isExpandingOrCollapsing = false; + + if (!isExpandingOrCollapsing) { + auto getIdentityExprs = [&rewriter](int n) { + SmallVector exprs; + for (int i = 0; i < n; ++i) + exprs.push_back(rewriter.getAffineDimExpr(i)); + return exprs; + }; + Location loc = reshapeOp.getLoc(); + int64_t totalElems = std::accumulate(srcShape.begin(), srcShape.end(), 1, + std::multiplies()); + auto elemType = operandType.getElementType(); + SmallVector collapsingMap = { + getIdentityExprs(dstShape.size())}; + SmallVector expandingMap = { + getIdentityExprs(srcShape.size())}; + + if (isLHLO) { + auto collapsedType = MemRefType::get({totalElems}, elemType); + Value collapsedOp = rewriter.create( + loc, collapsedType, args[0], collapsingMap); + Value reshapeBuffer = rewriter.create( + loc, resultType, collapsedOp, expandingMap); + rewriter.replaceOpWithNewOp( + reshapeOp, reshapeBuffer, args[1], /*inputPermutation =*/nullptr, + /*outputPermutation =*/nullptr); + } else { + auto collapsedType = RankedTensorType::get({totalElems}, elemType); + Value collapsedOp = rewriter.create( + loc, collapsedType, args[0], collapsingMap); + rewriter.replaceOpWithNewOp( + reshapeOp, resultType, collapsedOp, expandingMap); + } + return success(); + } if (isLHLO) { Value reshapeBuffer = rewriter.create( diff --git a/tensorflow/compiler/mlir/hlo/lib/utils/hlo_utils.cc b/tensorflow/compiler/mlir/hlo/lib/utils/hlo_utils.cc index df2442cc4b6..0bbd91e0680 100644 --- a/tensorflow/compiler/mlir/hlo/lib/utils/hlo_utils.cc +++ b/tensorflow/compiler/mlir/hlo/lib/utils/hlo_utils.cc @@ -60,10 +60,76 @@ DenseElementsAttr GetScalarOfType(Type ty, int64_t raw_value) { if (auto float_ty = ty.dyn_cast()) { APFloat value(float_ty.getFloatSemantics(), raw_value); return DenseElementsAttr::get(scalar_ty, value); + 
} else if (auto int_ty = ty.dyn_cast()) { + APInt value(int_ty.getWidth(), static_cast(raw_value), true); + return DenseElementsAttr::get(scalar_ty, value); + } else if (auto complex_ty = ty.dyn_cast()) { + Type complex_element_ty = complex_ty.getElementType(); + if (complex_element_ty.isF32()) { + return DenseElementsAttr::get( + scalar_ty, static_cast>(raw_value)); + } else if (complex_element_ty.isF64()) { + return DenseElementsAttr::get( + scalar_ty, static_cast>(raw_value)); + } } - auto int_ty = ty.cast(); - APInt value(int_ty.getWidth(), static_cast(raw_value), true); - return DenseElementsAttr::get(scalar_ty, value); + llvm_unreachable("unsupported type"); +} + +static APFloat GetScalarLimitOfFloatType(FloatType float_ty, + ScalarLimit limit) { + auto &semantics = float_ty.getFloatSemantics(); + switch (limit) { + case kLowest: + return APFloat::getLargest(semantics, /*negative=*/true); + case kInfinityLowest: + return APFloat::getInf(semantics, /*negative=*/true); + case kMax: + return APFloat::getLargest(semantics, /*negative=*/false); + case kInfinityMax: + return APFloat::getInf(semantics, /*negative=*/false); + } + llvm_unreachable("invalid limit"); +} + +// Returns a scalar value for the given integer type. +// +// The argument 'scalar' describes which scalar value to return. `integer_value` +// is used to specify the integer value for kInteger. For any other scalar, +// integer_value is ignored. +static APInt GetScalarLimitOfIntegerType(IntegerType integer_ty, + ScalarLimit limit) { + unsigned width = integer_ty.getWidth(); + switch (limit) { + case kLowest: + case kInfinityLowest: + if (integer_ty.isUnsigned()) { + return APInt::getMinValue(width); + } else { + return APInt::getSignedMinValue(width); + } + + case kMax: + case kInfinityMax: + if (integer_ty.isUnsigned()) { + return APInt::getMaxValue(width); + } else { + return APInt::getSignedMaxValue(width); + } + } + llvm_unreachable("invalid limit"); +} + +DenseElementsAttr GetScalarLimitOfType(Type ty, ScalarLimit limit) { + RankedTensorType scalar_ty = RankedTensorType::get({}, ty); + if (auto float_ty = ty.dyn_cast()) { + return DenseElementsAttr::get(scalar_ty, + GetScalarLimitOfFloatType(float_ty, limit)); + } else if (auto integer_ty = ty.dyn_cast()) { + return DenseElementsAttr::get( + scalar_ty, GetScalarLimitOfIntegerType(integer_ty, limit)); + } + llvm_unreachable("unsupported type"); } } // namespace hlo diff --git a/tensorflow/compiler/mlir/hlo/tests/canonicalize.mlir b/tensorflow/compiler/mlir/hlo/tests/canonicalize.mlir index 15b1a150fdd..0d20c3f517b 100644 --- a/tensorflow/compiler/mlir/hlo/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/canonicalize.mlir @@ -597,3 +597,24 @@ func @unpack_repack_same_tuple_single_element(%arg0: tuple>) -> tupl // CHECK: return [[ARG0]] return %3 : tuple> } + +// CHECK-LABEL: func @erase_dead_lhlo_constant +func @erase_dead_lhlo_constant() { + %M = alloc() : memref<256x1024xf32> + // CHECK-NEXT: return + "lmhlo.constant"(%M) {value = dense<0.0> : tensor} : (memref<256x1024xf32>) -> () + dealloc %M : memref<256x1024xf32> + return +} + +// A negative test for dead lhlo constant op erasure. 
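As a usage sketch for the new GetScalarLimitOfType helper above (not part of the patch itself), the limit values can seed reduction-style lowerings with the appropriate identity element; the builder `b` and the variable names below are assumptions:

// Splat attribute holding -inf for f32, e.g. as the init value of a max
// reduction (kInfinityLowest falls back to the lowest finite value for
// types without an infinity).
mlir::DenseElementsAttr neg_inf = mlir::hlo::GetScalarLimitOfType(
    b.getF32Type(), mlir::hlo::kInfinityLowest);

// For a 32-bit signed integer the same request yields
// std::numeric_limits<int32_t>::lowest(); kInfinityLowest and kLowest agree
// for integer types.
mlir::DenseElementsAttr int_lowest = mlir::hlo::GetScalarLimitOfType(
    b.getIntegerType(32), mlir::hlo::kLowest);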
+// CHECK-LABEL: func @erase_dead_lhlo_constant_negative +func @erase_dead_lhlo_constant_negative(%M : memref<4xf32>) -> memref<256x1024xf32> { + // CHECK-NEXT: lmhlo.constant + "lmhlo.constant"(%M) {value = dense<0.0> : tensor} : (memref<4xf32>) -> () + // CHECK-NEXT: alloc + // CHECK-NEXT: lmhlo.constant + %N = alloc() : memref<256x1024xf32> + "lmhlo.constant"(%N) {value = dense<0.0> : tensor} : (memref<256x1024xf32>) -> () + return %N : memref<256x1024xf32> +} diff --git a/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-to-linalg.mlir b/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-to-linalg.mlir index 46725e0bd09..aecf612962a 100644 --- a/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-to-linalg.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-to-linalg.mlir @@ -373,6 +373,18 @@ func @reshape_2D_4D(%arg0: tensor<12x42xi32>) -> tensor<12x1x42x1xi32> { // ----- +// CHECK-DAG: #[[RESHAPE_MAP1:.*]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +// CHECK-DAG: #[[RESHAPE_MAP2:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +// CHECK-LABEL: func @reshape_3D_4D +func @reshape_3D_4D(%arg0: tensor<1x49x16xf32>) -> tensor<1x784x1x1xf32> { + %0 = "mhlo.reshape"(%arg0) : (tensor<1x49x16xf32>) -> tensor<1x784x1x1xf32> + return %0 : tensor<1x784x1x1xf32> +} +// CHECK: linalg.tensor_reshape %{{.*}} [#[[RESHAPE_MAP1]]] +// CHECK: linalg.tensor_reshape %{{.*}} [#[[RESHAPE_MAP2]]] + +// ----- + // CHECK-LABEL: func @minf func @minf(%lhs: tensor<2x2xf32>, %rhs: tensor<2x2xf32>) -> tensor<2x2xf32> { %0 = "mhlo.minimum"(%lhs, %rhs) diff --git a/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-linalg.mlir b/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-linalg.mlir index 768d8da22bd..f174b005a8d 100644 --- a/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-linalg.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-linalg.mlir @@ -688,6 +688,20 @@ func @reshape_2D_4D(%arg0: memref<12x42xi32>, %arg1 : memref<12x1x42x1xi32>) { // ----- +// CHECK-DAG: #[[RESHAPE_MAP1:.*]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +// CHECK-DAG: #[[RESHAPE_MAP2:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +// CHECK-LABEL: func @reshape_3D_4D +func @reshape_3D_4D(%arg0: memref<1x49x16xf32>, %arg1: memref<1x784x1x1xf32>) { + "lmhlo.reshape"(%arg0, %arg1) + : (memref<1x49x16xf32>, memref<1x784x1x1xf32>) -> () + return +} +// CHECK: linalg.reshape %{{.*}} [#[[RESHAPE_MAP1]]] +// CHECK: linalg.reshape %{{.*}} [#[[RESHAPE_MAP2]]] +// CHECK: linalg.copy + +// ----- + // CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 2)> // CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1) -> (d0, d1)> // CHECK-LABEL: func @reverse diff --git a/tensorflow/compiler/mlir/hlo/tests/mhlo-transform-unranked.mlir b/tensorflow/compiler/mlir/hlo/tests/mhlo-transform-unranked.mlir index 56a7cf7294c..01ef250efd0 100644 --- a/tensorflow/compiler/mlir/hlo/tests/mhlo-transform-unranked.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/mhlo-transform-unranked.mlir @@ -69,7 +69,7 @@ func @sqrt_static(%a: tensor<2x3xf32>) -> tensor<2x3xf32> { func @add_unranked(%a : tensor<*xf32>, %b : tensor<*xf32>) -> tensor<*xf32> { // CHECK: %[[SHAPE_A:.*]] = shape.shape_of %[[A]] // CHECK: %[[SHAPE_B:.*]] = shape.shape_of %[[B]] - // CHECK: %[[SHAPE:.*]] = "shape.any"(%[[SHAPE_A]], %[[SHAPE_B]]) + // CHECK: %[[SHAPE:.*]] = shape.any %[[SHAPE_A]], %[[SHAPE_B]] // CHECK: %[[NUM_ELEMENTS:.*]] = shape.num_elements %[[SHAPE]] // CHECK: %[[FLAT_SHAPE:.*]] = tensor_from_elements(%[[NUM_ELEMENTS]]) : tensor<1xindex> // 
CHECK: %[[FLAT_A:.*]] = "mhlo.dynamic_reshape"(%[[A]], %[[FLAT_SHAPE]]) : (tensor<*xf32>, tensor<1xindex>) -> tensor diff --git a/tensorflow/compiler/mlir/hlo/tools/mlir-hlo-opt/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/tools/mlir-hlo-opt/CMakeLists.txt index 754469a3c84..69971f4c024 100644 --- a/tensorflow/compiler/mlir/hlo/tools/mlir-hlo-opt/CMakeLists.txt +++ b/tensorflow/compiler/mlir/hlo/tools/mlir-hlo-opt/CMakeLists.txt @@ -30,3 +30,5 @@ add_llvm_executable(mlir-hlo-opt mlir-hlo-opt.cpp ) llvm_update_compile_flags(mlir-hlo-opt) target_link_libraries(mlir-hlo-opt PRIVATE ${LIBS}) + +mlir_check_all_link_libraries(mlir-hlo-opt) diff --git a/tensorflow/compiler/mlir/hlo/tools/mlir-hlo-opt/mlir-hlo-opt.cpp b/tensorflow/compiler/mlir/hlo/tools/mlir-hlo-opt/mlir-hlo-opt.cpp index 70fc21d6959..d0c0e3c51e1 100644 --- a/tensorflow/compiler/mlir/hlo/tools/mlir-hlo-opt/mlir-hlo-opt.cpp +++ b/tensorflow/compiler/mlir/hlo/tools/mlir-hlo-opt/mlir-hlo-opt.cpp @@ -13,109 +13,25 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/InitLLVM.h" -#include "llvm/Support/SourceMgr.h" -#include "llvm/Support/ToolOutputFile.h" -#include "mlir-hlo/Dialect/mhlo/IR/register.h" +#include "mlir-hlo/Dialect/mhlo/IR/chlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" #include "mlir-hlo/Dialect/mhlo/transforms/register_passes.h" -#include "mlir/IR/Dialect.h" -#include "mlir/IR/MLIRContext.h" #include "mlir/InitAllDialects.h" #include "mlir/InitAllPasses.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Pass/PassManager.h" -#include "mlir/Support/FileUtilities.h" #include "mlir/Support/MlirOptMain.h" -// NOLINTNEXTLINE -static llvm::cl::opt inputFilename(llvm::cl::Positional, - llvm::cl::desc(""), - llvm::cl::init("-")); - -// NOLINTNEXTLINE -static llvm::cl::opt outputFilename( - "o", llvm::cl::desc("Output filename"), llvm::cl::value_desc("filename"), - llvm::cl::init("-")); - -// NOLINTNEXTLINE -static llvm::cl::opt splitInputFile( - "split-input-file", - llvm::cl::desc("Split the input file into pieces and process each " - "chunk independently"), - llvm::cl::init(false)); - -// NOLINTNEXTLINE -static llvm::cl::opt verifyDiagnostics( - "verify-diagnostics", - llvm::cl::desc("Check that emitted diagnostics match " - "expected-* lines on the corresponding line"), - llvm::cl::init(false)); - -// NOLINTNEXTLINE -static llvm::cl::opt verifyPasses( - "verify-each", - llvm::cl::desc("Run the verifier after each transformation pass"), - llvm::cl::init(true)); - -// NOLINTNEXTLINE -static llvm::cl::opt allowUnregisteredDialects( - "allow-unregistered-dialect", - llvm::cl::desc("Allow operation with no registered dialects"), - llvm::cl::init(false)); - -// NOLINTNEXTLINE -static llvm::cl::opt showDialects( - "show-dialects", llvm::cl::desc("Print the list of registered dialects"), - llvm::cl::init(false)); - int main(int argc, char **argv) { - mlir::registerAllDialects(); mlir::registerAllPasses(); - - mlir::mhlo::registerAllDialects(); mlir::mhlo::registerAllMhloPasses(); mlir::lmhlo::registerAllLmhloPasses(); - llvm::InitLLVM y(argc, argv); + mlir::DialectRegistry registry; + mlir::registerAllDialects(registry); + registry.insert(); + registry.insert(); + registry.insert(); - // Register any pass manager command line options. 
- mlir::registerPassManagerCLOptions(); - mlir::PassPipelineCLParser passPipeline("", "Compiler passes to run"); - - // Parse pass names in main to ensure static initialization completed. - llvm::cl::ParseCommandLineOptions(argc, argv, - "MLIR modular optimizer driver\n"); - - if (showDialects) { - mlir::MLIRContext context; - llvm::outs() << "Registered Dialects:\n"; - for (mlir::Dialect *dialect : context.getRegisteredDialects()) { - llvm::outs() << dialect->getNamespace() << "\n"; - } - return 0; - } - - // Set up the input file. - std::string errorMessage; - auto file = mlir::openInputFile(inputFilename, &errorMessage); - if (!file) { - llvm::errs() << errorMessage << "\n"; - return 1; - } - - auto output = mlir::openOutputFile(outputFilename, &errorMessage); - if (!output) { - llvm::errs() << errorMessage << "\n"; - exit(1); - } - - if (failed(MlirOptMain(output->os(), std::move(file), passPipeline, - splitInputFile, verifyDiagnostics, verifyPasses, - allowUnregisteredDialects))) { - return 1; - } - // Keep the output file if the invocation of MlirOptMain was successful. - output->keep(); - return 0; + return failed( + mlir::MlirOptMain(argc, argv, "MLIR HLO pass driver\n", registry)); } diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD index bd1dcdf06ea..2d3a58b5b9d 100644 --- a/tensorflow/compiler/mlir/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/BUILD @@ -29,6 +29,7 @@ filegroup( "ir/tfl_ops.td", "//tensorflow/compiler/mlir/lite/quantization:quantization_td_files", "@llvm-project//mlir:OpBaseTdFiles", + "@llvm-project//mlir:include/mlir/Interfaces/InferTypeOpInterface.td", "@llvm-project//mlir:include/mlir/Interfaces/LoopLikeInterface.td", "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", ], @@ -227,6 +228,7 @@ cc_library( "@llvm-project//mlir:DerivedAttributeOpInterface", "@llvm-project//mlir:Dialect", "@llvm-project//mlir:IR", + "@llvm-project//mlir:InferTypeOpInterface", "@llvm-project//mlir:LoopLikeInterface", "@llvm-project//mlir:QuantOps", "@llvm-project//mlir:SideEffects", @@ -500,6 +502,7 @@ gentbl( tblgen = "//tensorflow/compiler/mlir/lite/quantization:op_quant_spec_getters_gen", td_file = "ir/tfl_ops.td", td_srcs = [ + "@llvm-project//mlir:include/mlir/Interfaces/InferTypeOpInterface.td", "@llvm-project//mlir:include/mlir/Interfaces/LoopLikeInterface.td", "//tensorflow/compiler/mlir/lite/quantization:quantization_td_files", "ir/tfl_op_interfaces.td", @@ -670,6 +673,7 @@ cc_library( ":flatbuffer_tflite_operator_lib", ":tensorflow_lite", ":tensorflow_lite_dialect_registration", + "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:mangling_util", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", "//tensorflow/compiler/xla:statusor", @@ -737,16 +741,13 @@ cc_library( ], deps = [ ":flatbuffer_translate_lib", + ":tensorflow_lite", + "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", - "@com_google_absl//absl/base", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/container:flat_hash_set", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:MlirTranslateMain", "@llvm-project//mlir:QuantOps", - "@llvm-project//mlir:SCFTransforms", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Support", "@llvm-project//mlir:Translation", @@ -759,7 +760,7 @@ tf_cc_binary( deps = [ ":flatbuffer_translate_registeration", # TODO(b/155809683): 
Link only necessary dialects. - "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", ], ) @@ -811,7 +812,7 @@ tf_cc_binary( "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", # TODO(b/155809683): Link only necessary dialects. - "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@llvm-project//mlir:Support", @@ -835,19 +836,18 @@ tf_cc_binary( deps = [ ":flatbuffer_translate_lib", ":flatbuffer_translate_registeration", - "@com_google_absl//absl/strings", - "@llvm-project//llvm:Support", - # TODO(b/155809683): Link only necessary dialects. - "@llvm-project//mlir:AllPassesAndDialects", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Parser", - "@llvm-project//mlir:Support", - "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", + ":tensorflow_lite", + "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/core:lib", "//tensorflow/core/platform:logging", "//tensorflow/lite:framework", "//tensorflow/lite/delegates/flex:delegate", "//tensorflow/lite/kernels:builtin_ops", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:StandardOps", ], ) @@ -874,7 +874,7 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:translate_lib", "//tensorflow/core:core_cpu_base", "@llvm-project//llvm:Support", - "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@llvm-project//mlir:Transforms", @@ -908,7 +908,7 @@ cc_library( "//tensorflow/stream_executor/lib", "@com_google_absl//absl/types:span", "@llvm-project//llvm:Support", - "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", "@llvm-project//mlir:IR", "@llvm-project//mlir:Parser", "@llvm-project//mlir:Pass", diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc index 89fae87cb25..34200fb88b6 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc @@ -61,6 +61,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/lite/utils/convert_type.h" #include "tensorflow/compiler/mlir/lite/utils/stateful_ops_utils.h" #include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h" @@ -133,63 +134,59 @@ static StatusOr GetTFLiteType(Type type, return Status(error::INVALID_ARGUMENT, "'isSigned' can only be set for 8-bits integer type"); } - switch (type.getKind()) { - case mlir::StandardTypes::F32: - return tflite::TensorType_FLOAT32; - case mlir::StandardTypes::F16: - return tflite::TensorType_FLOAT16; - case mlir::StandardTypes::F64: - return tflite::TensorType_FLOAT64; - case mlir::TF::TensorFlowTypes::STRING: - return tflite::TensorType_STRING; - case mlir::TF::TensorFlowTypes::QUINT8: - return tflite::TensorType_UINT8; - case mlir::StandardTypes::Complex: { - auto ftype = type.cast().getElementType(); - if (ftype && ftype.isF32()) { - return tflite::TensorType_COMPLEX64; - } - if (ftype && ftype.isF64()) { - return tflite::TensorType_COMPLEX128; - } - return Status(error::INVALID_ARGUMENT, "Unsupported type"); + + if (type.isF32()) { + return tflite::TensorType_FLOAT32; + } else if (type.isF16()) { + return tflite::TensorType_FLOAT16; + } else if (type.isF64()) { + return tflite::TensorType_FLOAT64; + } else if (type.isa()) { + return tflite::TensorType_STRING; + } else if (type.isa()) { + return tflite::TensorType_UINT8; + } else if (auto complex_type = type.dyn_cast()) { + auto ftype = complex_type.getElementType(); + if (ftype.isF32()) { + return tflite::TensorType_COMPLEX64; } - case mlir::StandardTypes::Integer: { - const auto& itype = type.cast(); - switch (itype.getWidth()) { - case 1: - return tflite::TensorType_BOOL; - case 8: - return itype.isUnsigned() ? tflite::TensorType_UINT8 - : tflite::TensorType_INT8; - case 16: - return tflite::TensorType_INT16; - case 32: - return tflite::TensorType_INT32; - case 64: - return tflite::TensorType_INT64; - } + if (ftype.isF64()) { + return tflite::TensorType_COMPLEX128; } - case mlir::quant::QuantizationTypes::UniformQuantized: { - auto qtype = type.cast(); - return GetTFLiteType(qtype.getStorageType(), qtype.isSigned()); + return Status(error::INVALID_ARGUMENT, "Unsupported type"); + } else if (auto itype = type.dyn_cast()) { + switch (itype.getWidth()) { + case 1: + return tflite::TensorType_BOOL; + case 8: + return itype.isUnsigned() ? tflite::TensorType_UINT8 + : tflite::TensorType_INT8; + case 16: + return tflite::TensorType_INT16; + case 32: + return tflite::TensorType_INT32; + case 64: + return tflite::TensorType_INT64; } - case mlir::quant::QuantizationTypes::UniformQuantizedPerAxis: { - auto qtype = type.cast(); - return GetTFLiteType(qtype.getStorageType(), qtype.isSigned()); - } - case mlir::TF::TensorFlowTypes::RESOURCE: { - // Treat tf.resource values as integer values in flatbuffer. - // TODO(b/146131919): Maybe need to have a detailed design for supporting - // other resource types beyonds hash table resources and resource - // variables. - return tflite::TensorType_INT32; - } - default: - // TFLite export fills FLOAT32 for unknown data types. Returning an error - // for now for safety and this could be revisited when required. 
- return Status(error::INVALID_ARGUMENT, "Unsupported type"); + } else if (auto q_uniform_type = + type.dyn_cast()) { + return GetTFLiteType(q_uniform_type.getStorageType(), + q_uniform_type.isSigned()); + + } else if (auto q_peraxis_type = + type.dyn_cast()) { + return GetTFLiteType(q_peraxis_type.getStorageType(), + q_peraxis_type.isSigned()); + } else if (type.isa()) { + // Treat tf.resource values as integer values in flatbuffer. + // TODO(b/146131919): Maybe need to have a detailed design for supporting + // other resource types beyonds hash table resources and resource + // variables. + return tflite::TensorType_INT32; } + // TFLite export fills FLOAT32 for unknown data types. Returning an error + // for now for safety and this could be revisited when required. + return Status(error::INVALID_ARGUMENT, "Unsupported type"); } static bool IsConst(Operation* op) { @@ -358,8 +355,13 @@ class Translator { if (emit_custom_ops) { enabled_op_types_.emplace(OpType::kCustomOp); } - tf_dialect_ = module.getContext()->getRegisteredDialect("tf"); - tfl_dialect_ = module.getContext()->getRegisteredDialect("tfl"); + tf_dialect_ = + module.getContext()->getOrLoadDialect(); + tfl_dialect_ = module.getContext() + ->getOrLoadDialect(); + // Right now the TF executor dialect is still needed to build NodeDef. + module.getContext() + ->getOrLoadDialect(); } Optional TranslateInternal(); diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc index 3c8bf26aa14..230383729c4 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc @@ -65,6 +65,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/flatbuffer_operator.h" #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/utils/convert_type.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h" #include "tensorflow/compiler/xla/statusor.h" @@ -254,20 +255,35 @@ mlir::Operation* ConvertMinMaxToStatsOp(const TensorT& tensor, OpBuilder b, layer_stats, axis_stats, axis); } -StatusOr OpNameForOpCode(const tflite::OperatorCodeT opcode) { - if (opcode.builtin_code == tflite::BuiltinOperator_CUSTOM) { +// Returns true if this is a basic LSTM op. +bool IsBasicLSTMOp(tflite::BuiltinOptionsUnion op_union) { + if (const auto* op = op_union.AsLSTMOptions()) { + return op->kernel_type == tflite::LSTMKernelType_BASIC; + } else { + return false; + } +} + +// Gets the MLIR op name with the dialect name for the flatbuffer operator. 
+StatusOr GetMlirOpName(const tflite::OperatorT& op, + const tflite::OperatorCodeT& op_code) { + if (IsBasicLSTMOp(op.builtin_options)) { + return std::string("tfl.basic_lstm"); + } + + if (op_code.builtin_code == tflite::BuiltinOperator_CUSTOM) { return std::string("tfl.custom"); } - if (opcode.builtin_code == tflite::BuiltinOperator_IF) { + if (op_code.builtin_code == tflite::BuiltinOperator_IF) { return std::string("tf.If"); } - if (opcode.builtin_code == tflite::BuiltinOperator_WHILE) { + if (op_code.builtin_code == tflite::BuiltinOperator_WHILE) { return std::string("tf.While"); } - const char* op_name = tflite::EnumNameBuiltinOperator(opcode.builtin_code); - std::string lowered_name = llvm::StringRef(op_name).lower(); - return llvm::Twine("tfl.", lowered_name).str(); + llvm::StringRef op_name( + tflite::EnumNameBuiltinOperator(op_code.builtin_code)); + return llvm::Twine("tfl.", op_name.lower()).str(); } // The buffers in TFLite flatbuffers have their contents stored as a vector of @@ -464,7 +480,7 @@ StatusOr BuildConstOp(const tflite::TensorT& tensor, value = mlir::DenseStringElementsAttr::get(shaped_type, refs); } else if (elem_type.isa()) { - auto dialect = elem_type.getContext()->getRegisteredDialect("tf"); + auto dialect = elem_type.getContext()->getLoadedDialect("tf"); tensorflow::TensorProto repr = ConvertTfliteConstTensor(tensor, buffer); std::string mangled = tensorflow::mangling_util::MangleTensor(repr); @@ -510,14 +526,6 @@ llvm::SmallVector ConvertSubgraphIdxsToFunctionAttrs( return {}; } -// Returns true if this is a basic LSTM op. -bool IsBasicLSTMOp(tflite::BuiltinOptionsUnion op_union) { - if (const auto* op = op_union.AsLSTMOptions()) { - return op->kernel_type == tflite::LSTMKernelType_BASIC; - } else { - return false; - } -} // TODO(krzysd) Handle function calls StatusOr ConvertOp( @@ -525,7 +533,6 @@ StatusOr ConvertOp( const std::vector& intermediate_types, Value optional_arg_marker, const std::vector>& op_codes, - const std::vector& op_names, const std::vector& func_names, const std::vector>& tensors, Location loc, OpBuilder builder) { @@ -537,10 +544,10 @@ StatusOr ConvertOp( return emitError(loc, err.ToString()), err; } - const bool is_basic_lstm = IsBasicLSTMOp(op.builtin_options); - const tflite::OperatorCodeT op_code = *op_codes.at(op.opcode_index); - const std::string& op_name = - is_basic_lstm ? "tfl.basic_lstm" : op_names.at(op.opcode_index); + const tflite::OperatorCodeT& op_code = *op_codes.at(op.opcode_index); + + TF_ASSIGN_OR_RETURN(const std::string op_name, GetMlirOpName(op, op_code)); + OperationState op_state(loc, op_name); for (auto input_num : op.inputs) { @@ -791,8 +798,7 @@ static StatusOr PostProcessFuncOp(FuncOp func) { } // Build a FuncOp from a tflite SubGraph -// The op_names are a mapping from indexes into the TFLite operators array to -// the operator name MLIR expects (tfl.foo_op). The buffers are directly taken +// The buffers are directly taken // from the deserialized flatbuffer as we do not have the type information to // interpret them until this point. The base_loc parameter is the location of // the flatbuffer as a whole (usually a file). 
The is_entry_point flag @@ -802,7 +808,6 @@ static StatusOr PostProcessFuncOp(FuncOp func) { StatusOr ConvertSubgraph( const tflite::SubGraphT& subgraph, llvm::StringRef name, const std::vector>& op_codes, - const std::vector& op_names, const std::vector& func_names, const std::vector>& buffers, Location base_loc, Builder builder, bool is_entry_point, @@ -1002,8 +1007,7 @@ StatusOr ConvertSubgraph( TF_ASSIGN_OR_RETURN( auto* mlir_op, ConvertOp(*op, vals_map, intermediate_types, maybe_optional_arg_marker, - op_codes, op_names, func_names, subgraph.tensors, op_loc, - op_builder)); + op_codes, func_names, subgraph.tensors, op_loc, op_builder)); // Add the results to the value maps. There are two cases: 1. the result // tensor does not have min/max values, the original op result is used @@ -1069,6 +1073,10 @@ OwningModuleRef tflite::FlatBufferToMlir( const std::vector& ordered_input_arrays, const std::vector& ordered_output_arrays, bool experimental_prune_unreachable_nodes_unconditionally) { + context->loadDialect< + mlir::StandardOpsDialect, mlir::quant::QuantizationDialect, + mlir::TFL::TensorFlowLiteDialect, mlir::TF::TensorFlowDialect>(); + auto model_ptr = FlatBufferModel::VerifyAndBuildFromBuffer(buffer.data(), buffer.length()); if (nullptr == model_ptr) { @@ -1079,17 +1087,6 @@ OwningModuleRef tflite::FlatBufferToMlir( auto builder = Builder(context); - std::vector operator_names; - operator_names.reserve(model->operator_codes.size()); - - for (auto& opcode : model->operator_codes) { - auto operator_name_or_error = OpNameForOpCode(*opcode); - if (!operator_name_or_error.ok()) { - return emitError(base_loc, operator_name_or_error.status().ToString()), - nullptr; - } - operator_names.push_back(operator_name_or_error.ConsumeValueOrDie()); - } std::vector func_names; for (auto& subgraph : model->subgraphs) { @@ -1110,8 +1107,8 @@ OwningModuleRef tflite::FlatBufferToMlir( auto& subgraph = e.value(); std::string name = SubgraphName(e.index(), *subgraph); auto func_or_error = ConvertSubgraph( - *subgraph, name, model->operator_codes, operator_names, func_names, - model->buffers, base_loc, builder, + *subgraph, name, model->operator_codes, func_names, model->buffers, + base_loc, builder, // TODO(b/131175224,b/132239787) Support multiple entry points /*is_entry_point=*/e.index() == 0, /*use_external_constant=*/use_external_constant, ordered_input_arrays, diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc b/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc index ceaa4e215cf..5accb419e83 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc @@ -95,40 +95,34 @@ static tflite::MirrorPadMode ConvertTFL_MirrorPaddingAttrForOptionWriter( static tflite::TensorType ConvertDerivedTypeAttrForOptionWriter( mlir::Type type, flatbuffers::FlatBufferBuilder* builder) { - switch (type.getKind()) { - case mlir::StandardTypes::F16: - return tflite::TensorType_FLOAT16; - case mlir::StandardTypes::F32: - return tflite::TensorType_FLOAT32; - case mlir::TF::TensorFlowTypes::STRING: - return tflite::TensorType_STRING; - case mlir::StandardTypes::Complex: { - auto etype = type.cast().getElementType(); - if (etype.isF32()) { - return tflite::TensorType_COMPLEX64; - } - llvm_unreachable("invalid complex Type in conversion"); + if (type.isF16()) { + return tflite::TensorType_FLOAT16; + } else if (type.isF32()) { + return tflite::TensorType_FLOAT32; + } else if (type.isa()) { + return tflite::TensorType_STRING; + } else if (auto 
complex_type = type.dyn_cast()) { + if (complex_type.getElementType().isF32()) { + return tflite::TensorType_COMPLEX64; } - case mlir::StandardTypes::Integer: { - const auto& itype = type.cast(); - switch (itype.getWidth()) { - case 1: - return tflite::TensorType_BOOL; - case 8: - return tflite::TensorType_INT8; - case 16: - return tflite::TensorType_INT16; - case 32: - return tflite::TensorType_INT32; - case 64: - return tflite::TensorType_INT64; - default: - llvm_unreachable("invalid integer Type in conversion"); - } + llvm_unreachable("invalid complex Type in conversion"); + } else if (auto itype = type.dyn_cast()) { + switch (itype.getWidth()) { + case 1: + return tflite::TensorType_BOOL; + case 8: + return tflite::TensorType_INT8; + case 16: + return tflite::TensorType_INT16; + case 32: + return tflite::TensorType_INT32; + case 64: + return tflite::TensorType_INT64; + default: + llvm_unreachable("invalid integer Type in conversion"); } - default: - llvm_unreachable("invalid Type in conversion"); } + llvm_unreachable("invalid Type in conversion"); } // I32Attr already returns an int as required by flatbuffer builders. @@ -255,7 +249,7 @@ Status mlir::CustomOptionsToAttributes( {static_cast(custom_options.size())}, builder.getIntegerType(8)); attributes->emplace_back(builder.getNamedAttr( "custom_option", - OpaqueElementsAttr::get(builder.getContext()->getRegisteredDialect("tfl"), + OpaqueElementsAttr::get(builder.getContext()->getLoadedDialect("tfl"), type, content))); return Status::OK(); diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc b/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc index 5b95b30a96c..94f7e2261f7 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc @@ -17,6 +17,7 @@ limitations under the License. #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/ToolOutputFile.h" #include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project @@ -33,6 +34,8 @@ limitations under the License. #include "mlir/Translation.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/flatbuffer_export.h" #include "tensorflow/compiler/mlir/lite/flatbuffer_import.h" +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" using llvm::cl::opt; @@ -175,5 +178,11 @@ static TranslateToMLIRRegistration FlatBufferFileToMlirTransReg( }); static TranslateFromMLIRRegistration MLIRToFlatBufferTranslate( - "mlir-to-tflite-flatbuffer", MlirToFlatBufferFileTranslateFunction); + "mlir-to-tflite-flatbuffer", MlirToFlatBufferFileTranslateFunction, + [](DialectRegistry& registry) { + registry.insert(); + registry.insert(); + registry.insert(); + registry.insert(); + }); } // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc index b5fcd5e82e2..403b3dd18ad 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc @@ -30,6 +30,7 @@ limitations under the License. 
#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/Matchers.h" // from @llvm-project #include "mlir/IR/OpImplementation.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project @@ -253,9 +254,8 @@ struct TensorFlowLiteInlinerInterface : public DialectInlinerInterface { } }; -struct TensorFlowLiteOpFolderDialectInterface - : public OpFolderDialectInterface { - using OpFolderDialectInterface::OpFolderDialectInterface; +struct TensorFlowLiteDialectFoldInterface : public DialectFoldInterface { + using DialectFoldInterface::DialectFoldInterface; // Registered hook to check if the given region, which is attached to an // operation that is *not* isolated from above (i.e. no internal regions @@ -275,7 +275,7 @@ TensorFlowLiteDialect::TensorFlowLiteDialect(mlir::MLIRContext *context) #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.cc.inc" >(); addInterfaces(); + TensorFlowLiteDialectFoldInterface>(); } //===----------------------------------------------------------------------===// @@ -1028,9 +1028,12 @@ static LogicalResult Verify(PackOp op) { // Check axis bounds. if (input_type.hasRank()) { int64_t axis_value = op.axis().getSExtValue(); - if (abs(axis_value) > input_type.getRank()) - return op.emitOpError("op attribute 'axis' is out of bounds, got ") - << axis_value; + if (axis_value < 0) axis_value += input_type.getRank() + 1; + if (axis_value < 0 || axis_value >= input_type.getRank() + 1) + return op.emitOpError() + << "op attribute 'axis' should be in range [-rank - 1, rank + 1), " + << "got rank = " << input_type.getRank() + << ", and axis = " << op.axis().getSExtValue(); } // Make sure all inputs have the same shape and element type. @@ -1443,12 +1446,59 @@ void FakeQuantOp::getCanonicalizationPatterns(OwningRewritePatternList &results, // TODO(b/133486129): Implement shape inference for unpack -static LogicalResult Verify(UnpackOp op) { - // TODO(antiagainst): Implement other checks as in - // tensorflow/lite/kernels/unpack.cc +LogicalResult UnpackOp::inferReturnTypes( + MLIRContext *context, Optional loc, ValueRange operands, + DictionaryAttr attributes, RegionRange regions, + SmallVectorImpl &inferredReturnTypes) { + UnpackOpAdaptor op(operands, attributes); + // TODO(jpienaar): Refactor verify + if (failed(op.verify(loc.hasValue() ? *loc : UnknownLoc::get(context)))) + return failure(); - if (op.getOperation()->getNumResults() != op.num()) - return op.emitOpError("output count should match 'num' attribute"); + if (operands.size() != 1) { + return emitOptionalError(loc, "input count should be equal to 1"); + } + + const int64_t num_value = op.num().getInt(); + auto input_type = operands[0].getType().dyn_cast(); + if (!input_type || !input_type.hasRank()) { + // If input is unranked, then so is output. 
+ inferredReturnTypes.assign( + num_value, UnrankedTensorType::get(input_type.getElementType())); + return success(); + } + + if (input_type.hasStaticShape() && input_type.getNumElements() <= 0) { + return emitOptionalError( + loc, "number of elements in input shoule be larger than 0"); + } + + const int64_t rank = input_type.getRank(); + if (rank <= 0) { + return emitOptionalError(loc, "input should be of rank larger than 0"); + } + + int64_t axis_value = op.axis().getInt(); + if (axis_value < 0) { + axis_value += rank; + } + if (axis_value < 0 || axis_value >= rank) { + return emitOptionalError( + loc, "attribute 'axis' should be in range [-rank, rank), got axis = ", + op.axis().getInt(), ", and rank = ", rank); + } + + if (!ShapedType::isDynamic(input_type.getDimSize(axis_value)) && + input_type.getDimSize(axis_value) != num_value) { + return emitOptionalError(loc, "output count should match 'num' attribute"); + } + + auto output_shape = llvm::to_vector<4>(input_type.getShape()); + output_shape.erase(output_shape.begin() + axis_value); + + auto output_type = + RankedTensorType::get(output_shape, input_type.getElementType()); + inferredReturnTypes.assign(num_value, output_type); return success(); } diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.h b/tensorflow/compiler/mlir/lite/ir/tfl_ops.h index caed0bb3ad9..d2d8442155b 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.h +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.h @@ -26,6 +26,7 @@ limitations under the License. #include "mlir/IR/OpImplementation.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/Interfaces/DerivedAttributeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project #include "mlir/Interfaces/LoopLikeInterface.h" // from @llvm-project #include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index 6dc9fda656f..f1cdfec631d 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -19,6 +19,7 @@ limitations under the License. #define TFL_OPS include "mlir/IR/OpBase.td" +include "mlir/Interfaces/InferTypeOpInterface.td" include "mlir/Interfaces/LoopLikeInterface.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "tensorflow/compiler/mlir/lite/ir/tfl_op_interfaces.td" @@ -107,7 +108,11 @@ def OpaqueBytesAttr : ElementsAttrBase< ".getElementType().isInteger(8)">, ]>, "opaque bytes attribute" - >; + > { + let storageType = [{ OpaqueElementsAttr }]; + let returnType = [{ OpaqueElementsAttr }]; + let convertFromStorage = "$_self"; +} //===----------------------------------------------------------------------===// // Derived shape attribute class. 
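As a standalone sketch of the shape rule the new UnpackOp::inferReturnTypes implements (illustration only; the helper name below is hypothetical, not code from the patch): for a ranked input the unpacked axis dimension is erased and each of the `num` results gets the remaining shape, matching the tfl.unpack cases in tests/ops.mlir later in this change.

#include <cstdint>

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"

// Result shape for tfl.unpack on a ranked input: erase the unpacked axis.
// E.g. input shape {2, 3} with axis = 1 -> each of the num (= 3) results has
// shape {2}, i.e. tensor<2x3xi32> unpacks into 3 x tensor<2xi32>.
llvm::SmallVector<int64_t, 4> UnpackResultShape(
    llvm::ArrayRef<int64_t> input_shape, int64_t axis) {
  if (axis < 0) axis += input_shape.size();
  llvm::SmallVector<int64_t, 4> result(input_shape.begin(), input_shape.end());
  result.erase(result.begin() + axis);
  return result;
}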
@@ -3024,7 +3029,8 @@ def TFL_TransposeOp : TFL_Op<"transpose", [ def TFL_UnpackOp : TFL_Op<"unpack", [ NoSideEffect, SameOperandsAndResultElementType, - SameOperandsAndResultsScale]> { + SameOperandsAndResultsScale, + DeclareOpInterfaceMethods]> { let summary = "Unpacks a tensor along a dimension into multiple tensors"; let description = [{ @@ -3047,7 +3053,7 @@ def TFL_UnpackOp : TFL_Op<"unpack", [ let arguments = (ins TFL_TensorOf<[F32, I1, I8, UI8, I32, QI8, QUI8, I16, QI16]>:$input, - I32Attr:$num, + Confined:$num, I32Attr:$axis ); @@ -3055,8 +3061,6 @@ def TFL_UnpackOp : TFL_Op<"unpack", [ TFL_VariadicTensorOf<[F32, I1, I8, UI8, I32, QI8, QUI8, I16, QI16]>:$outputs ); - let verifier = [{ return Verify(*this); }]; - let hasOptions = 1; } diff --git a/tensorflow/compiler/mlir/lite/mlir_tflite_runner.cc b/tensorflow/compiler/mlir/lite/mlir_tflite_runner.cc index 0d42fbb9646..35a58a01a29 100644 --- a/tensorflow/compiler/mlir/lite/mlir_tflite_runner.cc +++ b/tensorflow/compiler/mlir/lite/mlir_tflite_runner.cc @@ -30,12 +30,16 @@ limitations under the License. #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/SMLoc.h" #include "llvm/Support/SourceMgr.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project #include "mlir/IR/Function.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project #include "mlir/Parser.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/flatbuffer_export.h" #include "tensorflow/compiler/mlir/lite/flatbuffer_export_flags.h" +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/lite/delegates/flex/delegate.h" @@ -98,6 +102,10 @@ int main(int argc, char** argv) { // Load the MLIR module. mlir::MLIRContext context; + context.getDialectRegistry() + .insert(); + llvm::SourceMgr source_mgr; source_mgr.AddNewSourceBuffer(std::move(*file_or_err), llvm::SMLoc()); mlir::OwningModuleRef module(mlir::parseSourceFile(source_mgr, &context)); diff --git a/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc b/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc index 6299a70b1df..7e7d4678a87 100644 --- a/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc +++ b/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc @@ -62,6 +62,10 @@ class ImportQuantStatsPass void runOnFunction() override; + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + // Parses the serialized quant stats protobuf and initialize the internal // data structure. This method must be called after the pass is created. 
bool ParseQuantStats(const std::string &stats_str); diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/BUILD b/tensorflow/compiler/mlir/lite/quantization/lite/BUILD index 31c0e4cb8a9..38c7ad86e05 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/quantization/lite/BUILD @@ -28,6 +28,7 @@ cc_library( deps = [ "//tensorflow/compiler/mlir/lite:common", "//tensorflow/compiler/mlir/lite:flatbuffer_translate_lib", + "//tensorflow/compiler/mlir/lite:tensorflow_lite", "//tensorflow/compiler/mlir/lite:tensorflow_lite_quantize", "//tensorflow/compiler/mlir/lite/quantization:quantization_config", "//tensorflow/compiler/mlir/tensorflow:error_util", @@ -74,6 +75,6 @@ tf_cc_binary( "//tensorflow/lite/schema:schema_fbs", "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", - "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", ], ) diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc index a2e3c065113..238710bcf13 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc +++ b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc @@ -25,6 +25,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" #include "tensorflow/compiler/mlir/lite/flatbuffer_export.h" #include "tensorflow/compiler/mlir/lite/flatbuffer_import.h" +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_config.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/utils/convert_type.h" @@ -52,6 +53,7 @@ TfLiteStatus QuantizeModel( } MLIRContext context; + context.getDialectRegistry().insert(); StatusScopedDiagnosticHandler statusHandler(&context, /*propagate=*/true); diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc index 9e0ad990657..16b51496b5f 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc @@ -99,12 +99,14 @@ class QuantizationDriver { public: explicit QuantizationDriver(FuncOp fn, bool is_signed, bool disable_per_channel, - OpQuantSpecGetter op_quant_spec_getter) + OpQuantSpecGetter op_quant_spec_getter, + bool enforce_fixed_output_range) : fn_(fn), builder_(fn.getBody()), is_signed_(is_signed), disable_per_channel_(disable_per_channel), - op_quant_spec_getter_(op_quant_spec_getter) {} + op_quant_spec_getter_(op_quant_spec_getter), + enforce_fixed_output_range_(enforce_fixed_output_range) {} // The entry point of the quantization parameters propagation. void Run(); @@ -354,6 +356,8 @@ class QuantizationDriver { llvm::SmallVector args_; OpQuantSpecGetter op_quant_spec_getter_; + + bool enforce_fixed_output_range_; }; } // namespace @@ -794,7 +798,8 @@ bool QuantizationDriver::PropagateParams() { } // TODO(fengliuai): make the bit width configurable. - if (auto restricted = llvm::dyn_cast(op)) { + auto restricted = llvm::dyn_cast(op); + if (restricted && enforce_fixed_output_range_) { // TODO(fengliuai): different result can have different fixed range. 
auto params = restricted.GetFixedOutputRange(is_signed_, /*bit_width=*/8); for (auto i = 0; i < op->getNumResults(); ++i) { @@ -864,10 +869,12 @@ void QuantizationDriver::Run() { } } -void ApplyQuantizationParamsPropagation( - mlir::FuncOp func, bool is_signed, bool disable_per_channel, - OpQuantSpecGetter op_quant_spec_getter) { - QuantizationDriver(func, is_signed, disable_per_channel, op_quant_spec_getter) +void ApplyQuantizationParamsPropagation(mlir::FuncOp func, bool is_signed, + bool disable_per_channel, + OpQuantSpecGetter op_quant_spec_getter, + bool post_training_quantization) { + QuantizationDriver(func, is_signed, disable_per_channel, op_quant_spec_getter, + post_training_quantization) .Run(); } diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h index 07e5ba4e879..6e356acbbdf 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h @@ -490,9 +490,13 @@ quant::QuantizedType GetUniformQuantizedTypeForBias( // and the propagation results are materialized by inserting pairs of quantize // and dequantize ops to this function. Set `disable_per_channel` to true to not // use per channel quantization even the op supports it. +// Setting `enforce_fixed_output_range` to true, to infer quantization +// parameters from the fixed output range ops. This is only used for +// post-training quantization. void ApplyQuantizationParamsPropagation(mlir::FuncOp func, bool is_signed, bool disable_per_channel, - OpQuantSpecGetter op_quant_spec_getter); + OpQuantSpecGetter op_quant_spec_getter, + bool enforce_fixed_output_range); // The function might contain more stats ops than required, and it will // introduce requantize if the calibration stats have conflicts. 
This method diff --git a/tensorflow/compiler/mlir/lite/tests/end2end/if_op.pbtxt b/tensorflow/compiler/mlir/lite/tests/end2end/if_op.pbtxt index f482e3db6b9..a7f6040f211 100644 --- a/tensorflow/compiler/mlir/lite/tests/end2end/if_op.pbtxt +++ b/tensorflow/compiler/mlir/lite/tests/end2end/if_op.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf_tfl_translate -tf-input-arrays=a,b -tf-input-data-types=DT_FLOAT,DT_FLOAT -tf-input-shapes=4:4 -tf-output-arrays=StatefulIf,StatelessIf %s -o - --output-mlir | FileCheck %s +# RUN: tf_tfl_translate -tf-input-arrays=a,b -tf-input-data-types=DT_FLOAT,DT_FLOAT -tf-input-shapes=: -tf-output-arrays=StatefulIf,StatelessIf %s -o - --output-mlir | FileCheck %s node { name: "tf.Less" op: "Less" diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unknown-op.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unknown-op.mlir deleted file mode 100644 index 7e9f66baa90..00000000000 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unknown-op.mlir +++ /dev/null @@ -1,8 +0,0 @@ -// RUN: not flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - 2>&1 | FileCheck %s - -func @main(tensor<3x2xi32>) -> tensor<3x2xi32> { -^bb0(%arg0: tensor<3x2xi32>): - // CHECK: error: 'unknown_op' op dialect is not registered - %0 = "unknown_op"(%arg0) : (tensor<3x2xi32>) -> tensor<3x2xi32> - return %0 : tensor<3x2xi32> -} diff --git a/tensorflow/compiler/mlir/lite/tests/ops.mlir b/tensorflow/compiler/mlir/lite/tests/ops.mlir index 7ef6997f938..cbb562c2e03 100644 --- a/tensorflow/compiler/mlir/lite/tests/ops.mlir +++ b/tensorflow/compiler/mlir/lite/tests/ops.mlir @@ -1139,9 +1139,15 @@ func @packInputRank(%arg0: tensor<1x4xi32>, %arg1: tensor<1x4xi32>) -> tensor<1x // ----- -func @packNegInputRank(%arg0: tensor<1x4xi32>, %arg1: tensor<1x4xi32>) -> tensor<2x1x4xi32> { +func @packNegInputAxis2(%arg0: tensor<1x4xi32>, %arg1: tensor<1x4xi32>) -> tensor<1x2x4xi32> { // CHECK: "tfl.pack"(%arg0, %arg1) {axis = -2 : i32, values_count = 2 : i32} - %0 = "tfl.pack"(%arg0, %arg1) {axis = -2 : i32, values_count = 2 : i32} : (tensor<1x4xi32>, tensor<1x4xi32>) -> tensor<2x1x4xi32> + %0 = "tfl.pack"(%arg0, %arg1) {axis = -2 : i32, values_count = 2 : i32} : (tensor<1x4xi32>, tensor<1x4xi32>) -> tensor<1x2x4xi32> + return %0 : tensor<1x2x4xi32> +} + +func @packNegInputAxis3(%arg0: tensor<1x4xi32>, %arg1: tensor<1x4xi32>) -> tensor<2x1x4xi32> { + // CHECK: "tfl.pack"(%arg0, %arg1) {axis = -3 : i32, values_count = 2 : i32} + %0 = "tfl.pack"(%arg0, %arg1) {axis = -3 : i32, values_count = 2 : i32} : (tensor<1x4xi32>, tensor<1x4xi32>) -> tensor<2x1x4xi32> return %0 : tensor<2x1x4xi32> } @@ -1172,7 +1178,7 @@ func @pack(%arg0: tensor<1xi32>, %arg1: tensor<2xi32>) -> tensor<2x2xi32> { // ----- func @pack(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2x2xi32> { - // expected-error @+1 {{op attribute 'axis' is out of bounds, got 3}} + // expected-error @+1 {{op attribute 'axis' should be in range [-rank - 1, rank + 1), got rank = 1, and axis = 3}} %0 = "tfl.pack"(%arg0, %arg1) {axis = 3 : i32, values_count = 2 : i32} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2x2xi32> return %0 : tensor<2x2xi32> } @@ -1183,7 +1189,22 @@ func @unpack(%arg0: tensor<2x3xi32>) -> tensor<2xi32> { // CHECK: "tfl.unpack"(%arg0) {axis = 1 : i32, num = 3 : i32} %0:3 = "tfl.unpack"(%arg0) {axis = 1 : i32, num = 3 : i32} : (tensor<2x3xi32>) -> (tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) return %0#0 : tensor<2xi32> +} +// ----- + +func @unpack(%arg0: tensor<2x3xi32>) -> tensor<2xi32> { + // CHECK: "tfl.unpack"(%arg0) 
{axis = -1 : i32, num = 3 : i32} + %0:3 = "tfl.unpack"(%arg0) {axis = -1 : i32, num = 3 : i32} : (tensor<2x3xi32>) -> (tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) + return %0#0 : tensor<2xi32> +} + +// ----- + +func @unpack(%arg0: tensor<2x3xi32>) -> tensor<3xi32> { + // CHECK: "tfl.unpack"(%arg0) {axis = -2 : i32, num = 2 : i32} + %0:2 = "tfl.unpack"(%arg0) {axis = -2 : i32, num = 2 : i32} : (tensor<2x3xi32>) -> (tensor<3xi32>, tensor<3xi32>) + return %0#0 : tensor<3xi32> } // ----- @@ -1204,6 +1225,45 @@ func @unpack(%arg0: tensor<2x3xi32>) -> tensor<2xi32> { // ----- +func @unpack(%arg0: tensor<2x3xi32>) -> tensor<2xi32> { + // expected-error @+1 {{attribute 'axis' should be in range [-rank, rank), got axis = 2, and rank = 2}} + %0:3 = "tfl.unpack"(%arg0) {axis = 2 : i32, num = 3 : i32} : (tensor<2x3xi32>) -> (tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) + return %0#0 : tensor<2xi32> +} + +// ----- + +func @unpack(%arg0: tensor<2x3xi32>) -> tensor<2xi32> { + // expected-error @+1 {{attribute 'axis' should be in range [-rank, rank), got axis = -3, and rank = 2}} + %0:3 = "tfl.unpack"(%arg0) {axis = -3 : i32, num = 3 : i32} : (tensor<2x3xi32>) -> (tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) + return %0#0 : tensor<2xi32> +} + +// ----- + +func @unpack(%arg0: tensor) -> tensor<2xi32> { + // expected-error @+1 {{input should be of rank larger than 0}} + %0:3 = "tfl.unpack"(%arg0) {axis = 0 : i32, num = 3 : i32} : (tensor) -> (tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) + return %0#0 : tensor<2xi32> +} + +// ----- + +func @unpack(%arg0: tensor<2x3xi32>) -> tensor<2xi32> { + // expected-error @+1 {{op inferred type incompatible with return type of operation}} + %0:3 = "tfl.unpack"(%arg0) {axis = 1 : i32, num = 3 : i32} : (tensor<2x3xi32>) -> (tensor<2xi32>, tensor<2x1xi32>, tensor<2xi32>) + return %0#0 : tensor<2xi32> +} + +// ----- + +func @unpack(%arg0: tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi32>) { + %0:2 = "tfl.unpack"(%arg0) {axis = 1 : i32, num = 2 : i32} : (tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi32>) + return %0#0, %0#1 : tensor<*xi32>, tensor<*xi32> +} + +// ----- + // CHECK-LABEL: testMean func @testMean(%arg0: tensor<2x2xf32>, %arg1 : tensor<1xi32>) -> tensor<1x2xf32> { // CHECK: "tfl.mean"(%arg0, %arg1) {keep_dims = false} diff --git a/tensorflow/compiler/mlir/lite/tests/optimize.mlir b/tensorflow/compiler/mlir/lite/tests/optimize.mlir index 7923c82ba92..edbcef3d321 100644 --- a/tensorflow/compiler/mlir/lite/tests/optimize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/optimize.mlir @@ -1115,3 +1115,63 @@ func @ConvertIdentityScatterNd(%arg0: tensor<4x3xf32>) -> tensor<4x3xf32> { // CHECK-SAME: (%[[ARG:.*]]: tensor<4x3xf32>) -> tensor<4x3xf32> // CHECK-NEXT: return %[[ARG]] : tensor<4x3xf32> } + +func @ReshapeAddUnknownShape(%arg0: tensor<*xf32>) -> tensor<3x4xf32> { + %cst = constant dense<[3, 4]> : tensor<2xi32> + %cst_0 = constant dense<1.000000e+00> : tensor<3x4xf32> + %0 = "tfl.reshape"(%arg0, %cst) : (tensor<*xf32>, tensor<2xi32>) -> tensor<3x4xf32> + %1 = "tfl.add"(%0, %cst_0) {fused_activation_function = "NONE"} : (tensor<3x4xf32>, tensor<3x4xf32>) -> tensor<3x4xf32> + return %1 : tensor<3x4xf32> +// CHECK-LABEL: ReshapeAddUnknownShape +// CHECK: %[[rs1:.*]] = "tfl.reshape"(%arg0 +// CHECK: %[[rs2:.*]] = tfl.add %[[rs1]] +// CHECK: return %[[rs2]] +} + +func @FoldSumKeepDim(%arg0: tensor<8x128xf32>) -> tensor<8x1xf32> { + %cst = constant dense<1> : tensor<1xi32> + %cst_1 = constant dense<[8, 1]> : tensor<2xi32> + %0 = "tfl.sum"(%arg0, %cst) {keep_dims = false} : 
(tensor<8x128xf32>, tensor<1xi32>) -> tensor<8xf32> + %1 = "tfl.reshape"(%0, %cst_1) : (tensor<8xf32>, tensor<2xi32>) -> tensor<8x1xf32> + return %1 : tensor<8x1xf32> + +// CHECK-LABEL: FoldSumKeepDim +// CHECK: %[[RESULT:.*]] = "tfl.sum"(%arg0, %cst) {keep_dims = true} : (tensor<8x128xf32>, tensor<1xi32>) -> tensor<8x1xf32> +// CHECK: return %[[RESULT]] : tensor<8x1xf32> +} + +func @FoldReduceMinKeepDim(%arg0: tensor<8x128xf32>) -> tensor<1x128xf32> { + %cst = constant dense<0> : tensor<1xi32> + %cst_1 = constant dense<[1, 128]> : tensor<2xi32> + %0 = "tfl.reduce_min"(%arg0, %cst) {keep_dims = false} : (tensor<8x128xf32>, tensor<1xi32>) -> tensor<128xf32> + %1 = "tfl.reshape"(%0, %cst_1) : (tensor<128xf32>, tensor<2xi32>) -> tensor<1x128xf32> + return %1 : tensor<1x128xf32> + +// CHECK-LABEL: FoldReduceMinKeepDim +// CHECK: %[[RESULT:.*]] = "tfl.reduce_min"(%arg0, %cst) {keep_dims = true} : (tensor<8x128xf32>, tensor<1xi32>) -> tensor<1x128xf32> +// CHECK: return %[[RESULT]] : tensor<1x128xf32> +} + +func @FoldReduceMaxKeepDim(%arg0: tensor<8x128xf32>) -> tensor<1x128xf32> { + %cst = constant dense<0> : tensor<1xi32> + %cst_1 = constant dense<[1, 128]> : tensor<2xi32> + %0 = "tfl.reduce_max"(%arg0, %cst) {keep_dims = false} : (tensor<8x128xf32>, tensor<1xi32>) -> tensor<128xf32> + %1 = "tfl.reshape"(%0, %cst_1) : (tensor<128xf32>, tensor<2xi32>) -> tensor<1x128xf32> + return %1 : tensor<1x128xf32> + +// CHECK-LABEL: FoldReduceMaxKeepDim +// CHECK: %[[RESULT:.*]] = "tfl.reduce_max"(%arg0, %cst) {keep_dims = true} : (tensor<8x128xf32>, tensor<1xi32>) -> tensor<1x128xf32> +// CHECK: return %[[RESULT]] : tensor<1x128xf32> +} + +func @FoldReduceProdKeepDim(%arg0: tensor<8x128xf32>) -> tensor<1x1xf32> { + %cst = constant dense<[0, 1]> : tensor<2xi32> + %cst_1 = constant dense<[1, 1]> : tensor<2xi32> + %0 = "tfl.reduce_prod"(%arg0, %cst) {keep_dims = false} : (tensor<8x128xf32>, tensor<2xi32>) -> tensor + %1 = "tfl.reshape"(%0, %cst_1) : (tensor, tensor<2xi32>) -> tensor<1x1xf32> + return %1 : tensor<1x1xf32> + +// CHECK-LABEL: FoldReduceProdKeepDim +// CHECK: %[[RESULT:.*]] = "tfl.reduce_prod"(%arg0, %cst) {keep_dims = true} : (tensor<8x128xf32>, tensor<2xi32>) -> tensor<1x1xf32> +// CHECK: return %[[RESULT]] : tensor<1x1xf32> +} diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir index 6ee5b67d65e..6a992d6dfe4 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir @@ -615,4 +615,18 @@ func @broadcast_to_i32(%input: tensor<3xi32>, %shape: tensor<2xi32>) -> tensor<3 // CHECK: return [[MUL]] : tensor<3x3xi32> } +// CHECK-LABEL: lower_rfft_to_rfft2d +func @lower_rfft_to_rfft2d(%input: tensor<10x20x30xf32>, %fft_len: tensor<1xi32>) -> tensor<10x20x30xcomplex> { + %0 = "tf.RFFT"(%input, %fft_len) : (tensor<10x20x30xf32>, tensor<1xi32>) -> tensor<10x20x30xcomplex> + return %0: tensor<10x20x30xcomplex> + +// CHECK: %[[CST:.*]] = constant dense<-2> : tensor +// CHECK: %[[CST0:.*]] = constant dense<1> : tensor<1xi32> +// CHECK: %[[CST1:.*]] = constant dense<0> : tensor +// CHECK: %[[EXP:.*]] = "tf.ExpandDims"(%arg0, %[[CST]]) : (tensor<10x20x30xf32>, tensor) -> tensor<10x20x1x30xf32> +// CHECK: %[[CON:.*]] = "tf.ConcatV2"(%[[CST0]], %arg1, %[[CST1]]) : (tensor<1xi32>, tensor<1xi32>, tensor) -> tensor<2xi32> +// CHECK: %[[RFF:.*]] = "tf.RFFT2D"(%[[EXP]], %[[CON]]) : (tensor<10x20x1x30xf32>, tensor<2xi32>) -> tensor<10x20x1x30xcomplex> +// CHECK: %[[SQE:.*]] = 
"tf.Squeeze"(%[[RFF]]) {squeeze_dims = [-2]} : (tensor<10x20x1x30xcomplex>) -> tensor<10x20x30xcomplex> +} + } diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize.cc b/tensorflow/compiler/mlir/lite/transforms/optimize.cc index eeecfac67cf..d28ee4b31fa 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/optimize.cc @@ -27,6 +27,7 @@ limitations under the License. #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" @@ -37,8 +38,10 @@ limitations under the License. #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" @@ -103,7 +106,8 @@ bool OperandsBroadcastToOutputType(Type a, Type b, Type expected_output) { bool IsTailOfShape(Type type1, Type type2) { auto tail_type = type1.dyn_cast(); auto full_type = type2.dyn_cast(); - if (!tail_type || !full_type || tail_type.getRank() > full_type.getRank()) + if (!tail_type || !full_type || !tail_type.hasRank() || + !full_type.hasRank() || tail_type.getRank() > full_type.getRank()) return false; auto i1 = tail_type.getShape().rbegin(), e1 = tail_type.getShape().rend(); auto i2 = full_type.getShape().rbegin(); @@ -244,6 +248,38 @@ static Type GetShapeStrippedType(TypeAttr type_attr) { } } +// Returns `true` if reducing `axes` in `input` with `keep_dims=true` results in +// the specified `shape` and `false` otherwise. +static bool ShapeMatchesReduceWithKeepAxes(Value input, + const mlir::Attribute &axes, + const mlir::Attribute &shape) { + RankedTensorType type = input.getType().dyn_cast_or_null(); + if (!type) return false; + + DenseIntElementsAttr axes_attr = + axes.dyn_cast_or_null(); + DenseIntElementsAttr shape_attr = + shape.dyn_cast_or_null(); + if (!axes_attr || !shape_attr) return false; + + if (shape_attr.getNumElements() != type.getRank()) return false; + + llvm::SmallSet axes_set; + for (auto a : axes_attr.getIntValues()) { + axes_set.insert(a.getZExtValue()); + } + + auto type_shape = type.getShape(); + for (uint64_t i = 0; i < type.getRank(); ++i) { + if (axes_set.contains(i)) { + if (shape_attr.getValue({i}) != 1) return false; + } else { + if (shape_attr.getValue({i}) != type_shape[i]) return false; + } + } + return true; +} + #include "tensorflow/compiler/mlir/lite/transforms/generated_optimize.inc" // Fuse Add with proceeding FullyConnected. 
diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td index 3c5fc7a0c5e..559d22dcf47 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td @@ -535,4 +535,20 @@ def OptimizeIdentityScatterNdOp : Pat< (replaceWithValue $params), [(CanOptimizeIdentityGatherNdOrScatterNdOp $params, $indices)]>; +def ShapeMatchesReduceWithKeepAxes : Constraint>; + +// Fold reshapes re-inserting reduced dimensions into the results of a reduction +// with `keep_dims=false` by chaning it to one using `keep_dims=true`. +foreach ReduceOp = [TFL_ReduceMaxOp, TFL_ReduceMinOp, TFL_ReduceProdOp, + TFL_SumOp] in { + def FoldReshapeTo#ReduceOp : Pat< + (TFL_ReshapeOp + (ReduceOp:$reduce $input, (ConstantOp I32ElementsAttr: $axes), + ConstBoolAttrFalse), + (ConstantOp I32ElementsAttr: $shape)), + (ReduceOp $input, (ConstantOp $axes), ConstBoolAttrTrue), + [(ShapeMatchesReduceWithKeepAxes $input, $axes, $shape), + (HasOneUse $reduce)]>; +} diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc index 9a27d0de62a..07b7aacd95d 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc @@ -23,6 +23,7 @@ limitations under the License. #include "llvm/ADT/StringRef.h" #include "llvm/Support/CommandLine.h" #include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project @@ -122,6 +123,10 @@ class PrepareQuantizePass // the best quantization practise. This also fixes some simple violations. void SanityCheckAndAdjustment(FuncOp func); + // Whether the func contains Quantize ops. This is used to determine whether + // to use the quantization parameters from the fixed output range property. + bool ContainsQuantizeOps(FuncOp func); + QuantizationSpecs quant_specs_; }; @@ -285,6 +290,13 @@ void PrepareQuantizePass::SanityCheckAndAdjustment(FuncOp func) { }); } +bool PrepareQuantizePass::ContainsQuantizeOps(FuncOp func) { + for (const auto& op : func.getOps()) { + if (llvm::isa(op)) return true; + } + return false; +} + using PrepareQuantStats = quant::ConvertStatsToQDQs; @@ -309,6 +321,7 @@ void PrepareQuantizePass::runOnFunction() { OwningRewritePatternList patterns; bool is_signed = quant_specs_.IsSignedInferenceType(); int bit_width = quant_specs_.GetQuantizationTypeWidth(); + bool enforce_fixed_output_range = ContainsQuantizeOps(func); if (is_signed) { patterns.insert>(ctx); // Convert quant stats to int8 quantization parameters. @@ -327,7 +340,8 @@ void PrepareQuantizePass::runOnFunction() { // values (tensors). 
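Regarding the `enforce_fixed_output_range` flag being threaded into the propagation call below: ops with a known ("fixed") output range, for instance a logistic-style activation whose outputs lie in [0, 1], get their output quantization parameters from that static range rather than from collected statistics, and the pass only enables this when the function already contains Quantize ops or `post_training_quantization` is set. As a generic, self-contained illustration of what a fixed range implies for unsigned 8-bit affine quantization (standard quantization math, not code from this patch):

#include <cmath>
#include <cstdint>
#include <utility>

// Computes (scale, zero_point) for an output whose range is known a priori,
// quantized to unsigned `bit_width` bits: real_value ~= scale * (q - zero_point).
std::pair<double, int32_t> FixedRangeQuantParams(double range_min,
                                                 double range_max,
                                                 int bit_width = 8) {
  const double qmax = static_cast<double>((1 << bit_width) - 1);  // 255 for 8 bits
  const double scale = (range_max - range_min) / qmax;
  const int32_t zero_point =
      static_cast<int32_t>(std::lround(-range_min / scale));
  return {scale, zero_point};
}
// Example: FixedRangeQuantParams(0.0, 1.0) yields scale = 1/255, zero_point = 0.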
ApplyQuantizationParamsPropagation( func, is_signed, disable_per_channel || quant_specs_.disable_per_channel, - GetOpQuantSpec); + GetOpQuantSpec, + enforce_fixed_output_range || quant_specs_.post_training_quantization); ConvertMlirQuantOpsToTFLQuantOps(func); } diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc index 918c3c69c93..c521ca0ed53 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc @@ -40,6 +40,7 @@ limitations under the License. #include "llvm/Support/Debug.h" #include "mlir/Analysis/LoopAnalysis.h" // from @llvm-project #include "mlir/Dialect/Quant/FakeQuantSupport.h" // from @llvm-project +#include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project #include "mlir/Dialect/Quant/UniformSupport.h" // from @llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project @@ -84,6 +85,11 @@ class PrepareTFPass : public PassWrapper { : unfold_batch_matmul_(unfold_batch_matmul) {} void runOnFunction() override; + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + private: bool unfold_batch_matmul_; }; @@ -706,10 +712,8 @@ struct ConvertTFBroadcastTo : public RewritePattern { shape_type.getDimSize(0) <= 5))) return failure(); - if (!((element_type.getKind() == mlir::StandardTypes::F32) || - (element_type.getKind() == mlir::StandardTypes::BF16) || - (element_type.getKind() == mlir::StandardTypes::Integer && - element_type.cast().getWidth() == 32))) + if (!(element_type.isa() || + element_type.isInteger(32))) return failure(); auto status_or_const_op = @@ -762,6 +766,102 @@ LogicalResult ConvertTf2XlaOps(FuncOp func, MLIRContext *context) { return applyPartialConversion(func, target, patterns); } +// Convert rfft to rfft2d. +// The transformation pattern looks like below: +// +// input fft_len +// \ / +// rfft +// +// || +// \/ +// +// input fft_len +// \ / +// expand_dim concat with [1] at the front +// \ / +// rfft_2d +// | +// squeeze +struct ConvertRfftToRfft2d : public RewritePattern { + explicit ConvertRfftToRfft2d(MLIRContext *context) + : RewritePattern(TF::RFFTOp::getOperationName(), 1, context) {} + + LogicalResult matchAndRewrite(Operation *op, + PatternRewriter &rewriter) const override { + auto rfft_op = dyn_cast(op); + + auto input = rfft_op.input(); + auto input_type = input.getType().dyn_cast_or_null(); + if (!input_type) return failure(); + auto fft_len = rfft_op.fft_length(); + auto fft_len_type = fft_len.getType().dyn_cast_or_null(); + if (!fft_len_type) return failure(); + + auto output_type = + rfft_op.getResult().getType().dyn_cast_or_null(); + if (!output_type) return failure(); + + // Expanded inputs. + // Insert at -2 location. 
+ auto one_ele_type = + mlir::RankedTensorType::get({1}, rewriter.getIntegerType(32)); + auto minus_two = CreateConstOpWithSingleValue(&rewriter, rfft_op.getLoc(), + one_ele_type, -2); + + SmallVector expanded_input_shape; + SmallVector expanded_output_shape; + int expanded_rank = input_type.getRank() + 1; + int r = 0; + for (int i = 0; i < expanded_rank; ++i) { + if (i == expanded_rank - 2) { + expanded_input_shape.push_back(1); + expanded_output_shape.push_back(1); + } else { + expanded_input_shape.push_back(input_type.getDimSize(r)); + expanded_output_shape.push_back(output_type.getDimSize(r)); + r++; + } + } + + auto expaned_input_type = mlir::RankedTensorType::get( + expanded_input_shape, input_type.getElementType()); + TF::ExpandDimsOp expanded_input = rewriter.create( + rfft_op.getLoc(), expaned_input_type, input, minus_two->getResult()); + + // Expanded fft_len. + auto one_attr = mlir::DenseIntElementsAttr::get(one_ele_type, {1}); + + auto one = rewriter.create(rfft_op.getLoc(), one_attr); + + auto zero = CreateConstOpWithSingleValue(&rewriter, rfft_op.getLoc(), + one_ele_type, 0); + + auto expanded_fft_len_type = + mlir::RankedTensorType::get({2}, fft_len_type.getElementType()); + + TF::ConcatV2Op expanded_fft_len = rewriter.create( + rfft_op.getLoc(), expanded_fft_len_type, + SmallVector({one.getResult(), fft_len}), zero->getResult()); + + // Insert the rfft_2d. + auto rfft2d_out_type = mlir::RankedTensorType::get( + expanded_output_shape, output_type.getElementType()); + TF::RFFT2DOp rfft2d = rewriter.create( + rfft_op.getLoc(), rfft2d_out_type, expanded_input.getResult(), + expanded_fft_len.getResult()); + + // Insert the squeeze op. + auto squeeze_dim = rewriter.getI64ArrayAttr({-2}); + TF::SqueezeOp squeeze = rewriter.create( + rfft_op.getLoc(), output_type, rfft2d.getResult(), squeeze_dim); + + rewriter.replaceOp(op, squeeze.getResult()); + + return success(); + } +}; + void PrepareTFPass::runOnFunction() { OwningRewritePatternList patterns; auto func = getFunction(); @@ -811,7 +911,8 @@ void PrepareTFPass::runOnFunction() { TF::ConvertTFBatchMatMulOp>(ctx); } patterns.insert(ctx); + ConvertTFDepthwiseConv2dNative, ConvertTFStridedSlice, + ConvertRfftToRfft2d>(ctx); applyPatternsAndFoldGreedily(func, patterns); } diff --git a/tensorflow/compiler/mlir/lite/utils/constant_utils.cc b/tensorflow/compiler/mlir/lite/utils/constant_utils.cc index 8562f623258..b32da24d00f 100644 --- a/tensorflow/compiler/mlir/lite/utils/constant_utils.cc +++ b/tensorflow/compiler/mlir/lite/utils/constant_utils.cc @@ -30,80 +30,66 @@ stream_executor::port::StatusOr CreateConstOpWithSingleValue( Type element_type = shaped_type.getElementType(); ShapedType scalar_type = RankedTensorType::get({}, element_type); Attribute attr; - switch (element_type.getKind()) { - case mlir::StandardTypes::F16: { - auto floatType = mlir::FloatType::getF16(element_type.getContext()); - auto floatAttr = - mlir::FloatAttr::get(floatType, static_cast(value)); - std::vector floatValues({floatAttr}); - attr = DenseElementsAttr::get(scalar_type, floatValues); - break; - } - case mlir::StandardTypes::BF16: { - auto floatType = mlir::FloatType::getBF16(element_type.getContext()); - auto floatAttr = - mlir::FloatAttr::get(floatType, static_cast(value)); - std::vector floatValues({floatAttr}); - attr = DenseElementsAttr::get(scalar_type, floatValues); - break; - } - case mlir::StandardTypes::F32: { - attr = - DenseElementsAttr::get(scalar_type, static_cast(value)); - break; - } - case mlir::StandardTypes::Complex: { - auto etype = 
element_type.cast().getElementType(); - if (etype.isF32()) { - auto dialect = etype.getContext()->getRegisteredDialect("tf"); - tensorflow::TensorProto repr; - repr.set_dtype(tensorflow::DT_COMPLEX64); + if (element_type.isF16()) { + auto floatType = mlir::FloatType::getF16(element_type.getContext()); + auto floatAttr = mlir::FloatAttr::get(floatType, static_cast(value)); + std::vector floatValues({floatAttr}); + attr = DenseElementsAttr::get(scalar_type, floatValues); + } else if (element_type.isBF16()) { + auto floatType = mlir::FloatType::getBF16(element_type.getContext()); + auto floatAttr = mlir::FloatAttr::get(floatType, static_cast(value)); + std::vector floatValues({floatAttr}); + attr = DenseElementsAttr::get(scalar_type, floatValues); + } else if (element_type.isF32()) { + attr = + DenseElementsAttr::get(scalar_type, static_cast(value)); + } else if (auto complex_type = element_type.dyn_cast()) { + auto etype = complex_type.getElementType(); + if (etype.isF32()) { + auto dialect = etype.getContext()->getLoadedDialect("tf"); + tensorflow::TensorProto repr; + repr.set_dtype(tensorflow::DT_COMPLEX64); - tensorflow::TensorShapeProto* shape = repr.mutable_tensor_shape(); - shape->set_unknown_rank(false); - shape->add_dim()->set_size(int64_t{1}); - std::string content; - auto complex_value = - std::complex(static_cast(value), 0.0f); - content.assign(reinterpret_cast(&complex_value), - sizeof(complex_value)); - repr.set_tensor_content(content); - std::string mangled = tensorflow::mangling_util::MangleTensor(repr); + tensorflow::TensorShapeProto* shape = repr.mutable_tensor_shape(); + shape->set_unknown_rank(false); + shape->add_dim()->set_size(int64_t{1}); + std::string content; + auto complex_value = std::complex(static_cast(value), 0.0f); + content.assign(reinterpret_cast(&complex_value), + sizeof(complex_value)); + repr.set_tensor_content(content); + std::string mangled = tensorflow::mangling_util::MangleTensor(repr); - attr = mlir::OpaqueElementsAttr::get(dialect, scalar_type, mangled); + attr = mlir::OpaqueElementsAttr::get(dialect, scalar_type, mangled); + } else { + return tensorflow::Status(tensorflow::error::INVALID_ARGUMENT, + "Unsupported type"); + } + } else if (auto itype = element_type.dyn_cast()) { + switch (itype.getWidth()) { + case 8: + attr = DenseElementsAttr::get(scalar_type, + static_cast(value)); break; - } - return tensorflow::Status(tensorflow::error::INVALID_ARGUMENT, - "Unsupported type"); + case 16: + attr = DenseElementsAttr::get(scalar_type, + static_cast(value)); + break; + case 32: + attr = DenseElementsAttr::get(scalar_type, + static_cast(value)); + break; + case 64: + attr = DenseElementsAttr::get(scalar_type, + static_cast(value)); + break; + default: + return tensorflow::Status(tensorflow::error::INVALID_ARGUMENT, + "Unsupported type"); } - case mlir::StandardTypes::Integer: { - const auto& itype = element_type.cast(); - switch (itype.getWidth()) { - case 8: - attr = DenseElementsAttr::get(scalar_type, - static_cast(value)); - break; - case 16: - attr = DenseElementsAttr::get(scalar_type, - static_cast(value)); - break; - case 32: - attr = DenseElementsAttr::get(scalar_type, - static_cast(value)); - break; - case 64: - attr = DenseElementsAttr::get(scalar_type, - static_cast(value)); - break; - default: - return tensorflow::Status(tensorflow::error::INVALID_ARGUMENT, - "Unsupported type"); - } - break; - } - default: - return tensorflow::Status(tensorflow::error::INVALID_ARGUMENT, - "Unsupported type"); + } else { + return 
tensorflow::Status(tensorflow::error::INVALID_ARGUMENT, + "Unsupported type"); } return rewriter->create(loc, scalar_type, attr); } diff --git a/tensorflow/compiler/mlir/lite/utils/lstm_utils_test.cc b/tensorflow/compiler/mlir/lite/utils/lstm_utils_test.cc index 081ba7ac6e7..f26689fac5e 100644 --- a/tensorflow/compiler/mlir/lite/utils/lstm_utils_test.cc +++ b/tensorflow/compiler/mlir/lite/utils/lstm_utils_test.cc @@ -93,8 +93,9 @@ class LstmUtilsTest : public ::testing::Test { LstmUtilsTest() {} void SetUp() override { - RegisterDialects(); context_ = std::make_unique(); + context_->loadDialect(); builder_ = std::unique_ptr(new Builder(context_.get())); fused_lstm_func_ = createLstmCompositeFunc(builder_.get(), false, false); fused_lstm_func_cifg_ = @@ -109,12 +110,6 @@ class LstmUtilsTest : public ::testing::Test { builder_.reset(); } - void RegisterDialects() { - mlir::registerDialect(); - mlir::registerDialect(); - mlir::registerDialect(); - } - FuncOp fused_lstm_func_; FuncOp fused_lstm_func_cifg_; FuncOp fused_ln_lstm_func_; diff --git a/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc b/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc index 96d22cb51e9..4035fed221d 100644 --- a/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc +++ b/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc @@ -56,9 +56,9 @@ inline OpaqueElementsAttr CustomOption(OpBuilder* builder, const std::string& content) { ShapedType type = RankedTensorType::get( {static_cast(content.size())}, builder->getIntegerType(8)); - return OpaqueElementsAttr::get( - builder->getContext()->getRegisteredDialect("tfl"), type, - StringRef(content.data(), content.size())); + return OpaqueElementsAttr::get(builder->getContext()->getLoadedDialect("tfl"), + type, + StringRef(content.data(), content.size())); } inline TensorType GetInputType(FuncOp func, int idx) { diff --git a/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc b/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc index 8be6facce38..00efffff144 100644 --- a/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc +++ b/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc @@ -128,6 +128,7 @@ Status MlirFunctionOptimizationPass::Run( GraphDebugInfo debug_info; RegisterDialects(); mlir::MLIRContext context; + context.loadAllGloballyRegisteredDialects(); GraphImportConfig import_config; import_config.graph_as_function = true; import_config.control_outputs = *control_ret_node_names; @@ -208,6 +209,7 @@ Status MlirV1CompatGraphOptimizationPass::Run( GraphDebugInfo debug_info; RegisterDialects(); mlir::MLIRContext context; + context.loadAllGloballyRegisteredDialects(); GraphImportConfig import_config; import_config.upgrade_legacy = true; // Restrict functionalization to TPU nodes to avoid problems in v1 session diff --git a/tensorflow/compiler/mlir/python/mlir.cc b/tensorflow/compiler/mlir/python/mlir.cc index 5ce0ca8cfcb..f1f6c43d3b3 100644 --- a/tensorflow/compiler/mlir/python/mlir.cc +++ b/tensorflow/compiler/mlir/python/mlir.cc @@ -41,6 +41,7 @@ std::string ImportGraphDef(const std::string &proto, GraphDebugInfo debug_info; GraphImportConfig specs; mlir::MLIRContext context; + context.loadAllGloballyRegisteredDialects(); auto module = ConvertGraphdefToMlir(graphdef, debug_info, specs, &context); if (!module.ok()) { Set_TF_Status_from_Status(status, module.status()); @@ -85,6 +86,7 @@ std::string ExperimentalConvertSavedModelToMlir( std::vector exported_names = absl::StrSplit(exported_names_str, ',', absl::SkipEmpty()); mlir::MLIRContext context; + 
context.loadAllGloballyRegisteredDialects(); auto module_or = ConvertSavedModelToMlir( &bundle, &context, absl::Span(exported_names)); if (!module_or.status().ok()) { @@ -115,6 +117,7 @@ std::string ExperimentalConvertSavedModelV1ToMlir( // Convert the SavedModelBundle to an MLIR module. mlir::MLIRContext context; + context.loadAllGloballyRegisteredDialects(); auto module_or = ConvertSavedModelV1ToMlir(bundle, {}, &context, upgrade_legacy); if (!module_or.status().ok()) { diff --git a/tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.cc b/tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.cc index 63ca4c7bb28..4152b576e71 100644 --- a/tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.cc +++ b/tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.cc @@ -38,6 +38,7 @@ PYBIND11_MODULE(mlir_wrapper, m) { SM.AddNewSourceBuffer(llvm::MemoryBuffer::getMemBuffer(input), llvm::SMLoc()); mlir::MLIRContext ctx; + ctx.loadAllGloballyRegisteredDialects(); auto module = mlir::parseSourceFile(SM, &ctx); if (!module) { return false; diff --git a/tensorflow/compiler/mlir/python/mlir_wrapper/types.cc b/tensorflow/compiler/mlir/python/mlir_wrapper/types.cc index 2be67f8e93e..be2dc2065f3 100644 --- a/tensorflow/compiler/mlir/python/mlir_wrapper/types.cc +++ b/tensorflow/compiler/mlir/python/mlir_wrapper/types.cc @@ -20,11 +20,6 @@ limitations under the License. void init_types(py::module& m) { // Type py::class_ Type(m, "Type"); - Type.def("getKind", &mlir::Type::getKind); - - // Type Enums - py::enum_(Type, "StandardTypes_Kind") - .value("BF16", mlir::StandardTypes::BF16); // Type Sub-classes py::class_(m, "FunctionType") @@ -32,7 +27,10 @@ void init_types(py::module& m) { [](mlir::FunctionType& ft) { return ft.getResults().vec(); }); py::class_(m, "FloatType") - .def("get", &mlir::FloatType::get); + .def("getBF16", &mlir::FloatType::getBF16) + .def("getF16", &mlir::FloatType::getF16) + .def("getF32", &mlir::FloatType::getF32) + .def("getF64", &mlir::FloatType::getF64); py::class_(m, "IntegerType") .def("get", py::overload_cast( diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index f9b1abcccc6..b8c7376ebd3 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -355,6 +355,7 @@ cc_library( "ir/tf_remaining_ops.h.inc", ] + ["ir/tf_" + target["name"] + ".h.inc" for target in tf_ops_category_list], deps = [ + ":attribute_utils", ":tensorflow_attributes", ":tensorflow_canonicalize_inc_gen", ":tensorflow_op_interfaces", @@ -722,6 +723,7 @@ cc_library( "//tensorflow/core:framework", "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", + "@llvm-project//mlir:Analysis", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@llvm-project//mlir:Support", @@ -775,6 +777,7 @@ cc_library( "transforms/sink_constant.cc", "transforms/stack_ops_decomposition.cc", "transforms/tensor_array_ops_decomposition.cc", + "transforms/tensor_device_copy_conversion.cc", "transforms/tensor_list_ops_decomposition.cc", "transforms/test_resource_alias_analysis.cc", "transforms/test_side_effect_analysis.cc", @@ -787,6 +790,7 @@ cc_library( "transforms/tpu_extract_head_tail_outside_compilation.cc", "transforms/tpu_extract_outside_compilation.cc", "transforms/tpu_host_computation_expansion.cc", + "transforms/tpu_identity_pruning.cc", "transforms/tpu_merge_variables_with_execute.cc", "transforms/tpu_outside_compilation_cluster.cc", "transforms/tpu_rewrite_pass.cc", @@ -799,7 +803,6 @@ cc_library( 
"translate/tf_functional_to_executor.cc", ], hdrs = [ - "transforms/attribute_utils.h", "transforms/batchmatmul_to_einsum.h", "transforms/bridge.h", "transforms/collection_ops_util.h", @@ -809,6 +812,7 @@ cc_library( ], includes = ["include"], deps = [ + ":attribute_utils", ":bridge_logger", ":convert_tensor", ":convert_type", @@ -1269,7 +1273,7 @@ cc_library( name = "tf_dialect_passes", srcs = [ "transforms/constant_fold.cc", - "transforms/dialect_hooks.cc", + "transforms/decode_attributes_hook.cc", ], hdrs = [ "transforms/constant_fold.h", @@ -1632,6 +1636,7 @@ cc_library( deps = [ ":lower_tf_inc_gen", ":tensorflow", + ":tensorflow_ops", ":tensorflow_types", "//tensorflow/core:framework", "@llvm-project//llvm:Support", @@ -1819,3 +1824,11 @@ cc_library( "@llvm-project//mlir:Support", ], ) + +cc_library( + name = "attribute_utils", + hdrs = ["utils/attribute_utils.h"], + deps = [ + "@llvm-project//mlir:IR", + ], +) diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.cc b/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.cc index 7ad2705263b..8ec7513f81f 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.cc +++ b/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.cc @@ -21,11 +21,13 @@ limitations under the License. #include "absl/strings/str_cat.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" +#include "mlir/Analysis/CallGraph.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Block.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project @@ -35,6 +37,7 @@ limitations under the License. #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" @@ -134,12 +137,46 @@ class BacktrackAnalysis { return GetAnalysisForRegion(region); } + // Returns the backtrack analysis for the given region if it exists. + // If the region has not yet been analyzed, returns llvm::None. + Optional GetAnalysisIfExists(Region& region) const { + auto it = info_map_.find(®ion); + if (it == info_map_.end()) return llvm::None; + return &it->second; + } + + Optional GetAnalysisIfExists(FuncOp func) const { + return GetAnalysisIfExists(func.getBody()); + } + private: llvm::SmallDenseMap info_map_; }; // Analyzes all regions attached to all operations in the module. BacktrackAnalysis::BacktrackAnalysis(ModuleOp module) { + const CallGraph call_graph(module); + + // Visit functions bottom up when doing the analysis. Note that SCC iterator + // has the property that if there is an edge from SCC1->SCC2, SCC1 is visited + // after SCC2, i.e., the graph is traversed bottom up just the way we want. + auto scc_begin = llvm::scc_begin(&call_graph); + auto scc_end = llvm::scc_end(&call_graph); + for (auto& scc : make_range(scc_begin, scc_end)) { + // Each SCC node is a collection of callgraph nodes that form a cycle. We + // will visit these nodes in an arbitrary order. 
If a node being visited + // calls a function that has not yet been analyzed, we will not be able to + // backtrack through that function call (our analysis will be correct but + // pessimistic). + for (CallGraphNode* node : scc) { + if (node->isExternal()) continue; + Region* region = node->getCallableRegion(); + GetOrCreateAnalysis(*region); + } + } + + // This above call graph analysis will cover all regions attached to functions + // but we also need to analyze regions attached to other ops. module.walk([this](Operation* op) { for (Region& region : op->getRegions()) GetOrCreateAnalysis(region); }); @@ -160,6 +197,18 @@ Value BacktrackAnalysis::BacktrackValue(Value value) { value = island.GetYield().getOperand(res_index); } else if (isa(op)) { value = op->getOperand(res_index); + } else if (auto call = dyn_cast(op)) { + FuncOp func = dyn_cast(call.resolveCallable()); + if (!func) break; + // Check if the function being called has been analyzed. if not, + // we cannot backtrack the value further. + Optional callee_info = GetAnalysisIfExists(func); + if (!callee_info) break; + Optional passthrough_arg = callee_info.getValue()->GetArg(res_index); + if (!passthrough_arg) break; + value = call.getArgOperands()[passthrough_arg.getValue()]; + } else if (isa(op)) { + value = op->getRegion(0).front().getTerminator()->getOperand(res_index); } else { break; } @@ -359,6 +408,13 @@ ResourceAliasAnalysisInfo::ResourceAliasAnalysisInfo( AddValueUniqueIDMapping(result, kUnknownResourceId); } } + } else if (isa(op)) { + Region& region = op->getRegion(0); + const auto& body_info = backtrack_analysis.GetAnalysisForRegion(region); + for (auto result : filter_resources(op->getResults())) { + Value body_result = body_info.GetValue(result.getResultNumber()); + PropagateInputToOutput(body_result, result); + } } else { assign_unknown_id_to_all(op->getResults()); } @@ -493,10 +549,7 @@ llvm::SmallSetVector ResourceAliasAnalysisInfo::GetResourceAliases( // ResourceAliasAnalysis //===----------------------------------------------------------------------===// -ResourceAliasAnalysis::ResourceAliasAnalysis(Operation* op) { - auto module = dyn_cast(op); - assert(module); - +ResourceAliasAnalysis::ResourceAliasAnalysis(ModuleOp module) { // Analyze all regions for backtracking info. detail::BacktrackAnalysis backtrack_analysis(module); diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h b/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h index c965b5d7602..46bb57c942d 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h +++ b/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h @@ -102,7 +102,7 @@ class ResourceAliasAnalysis : public detail::PerFunctionAggregateAnalysis< detail::ResourceAliasAnalysisInfo> { public: // Constructs analysis by analyzing the given module operation. 
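A note on the new `GetAnalysisIfExists` lookups used by `BacktrackValue` above: when a call is encountered, the callee's summary is consulted only if it was already computed by the bottom-up (callee-before-caller) SCC walk; if it was not, backtracking simply stops at the call, keeping the analysis correct but conservative. The sketch below captures that lookup-or-stop step with plain standard-library types standing in for the real region/info classes; all names here are illustrative.

#include <optional>
#include <vector>

// Per-callee summary: for each result index, the index of the argument it
// passes through unchanged, if any (stand-in for the real per-region info).
struct CalleeSummary {
  std::vector<std::optional<int>> passthrough_arg_for_result;
};

// Backtracks one call edge: if the callee is analyzed and result `res_index`
// is a passthrough of argument k, the caller continues from its k-th call
// operand; otherwise backtracking stops (conservative but still correct).
std::optional<int> BacktrackThroughCall(
    const std::optional<CalleeSummary>& callee_info, int res_index) {
  if (!callee_info) return std::nullopt;
  const auto& passthrough = callee_info->passthrough_arg_for_result;
  if (res_index < 0 || res_index >= static_cast<int>(passthrough.size()))
    return std::nullopt;
  return passthrough[res_index];
}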
- explicit ResourceAliasAnalysis(Operation* op); + explicit ResourceAliasAnalysis(ModuleOp module); }; // Returns a range with just resource type values from the input range diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc index e382bdb28c6..c78a7e403c4 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc +++ b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc @@ -320,10 +320,7 @@ SideEffectAnalysisInfo::DirectControlSuccessors( } } // namespace detail -SideEffectAnalysis::SideEffectAnalysis(Operation* op) { - auto module = dyn_cast(op); - assert(module); - +SideEffectAnalysis::SideEffectAnalysis(ModuleOp module) { // Analyze entire module for alias analysis info. ResourceAliasAnalysis alias_analysis(module); diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h index c92c6e1882c..a75f7eb7dee 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h +++ b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h @@ -130,7 +130,7 @@ class SideEffectAnalysis : public detail::PerFunctionAggregateAnalysis< detail::SideEffectAnalysisInfo> { public: // Constructs analysis by analyzing the given module operation. - explicit SideEffectAnalysis(Operation* op); + explicit SideEffectAnalysis(ModuleOp module); }; } // namespace TF diff --git a/tensorflow/compiler/mlir/tensorflow/c/BUILD b/tensorflow/compiler/mlir/tensorflow/c/BUILD index 801e35280d6..243f4b5139f 100644 --- a/tensorflow/compiler/mlir/tensorflow/c/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/c/BUILD @@ -41,6 +41,7 @@ tf_cuda_library( "//tensorflow/core:protos_all_cc", "//tensorflow/core/lib/llvm_rtti", "//tensorflow/core/platform:errors", + "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", diff --git a/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir.cc b/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir.cc index edf5d09b401..c62d62a2d3d 100644 --- a/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir.cc +++ b/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir.cc @@ -16,6 +16,7 @@ limitations under the License. #include #include +#include "absl/strings/str_cat.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/raw_ostream.h" @@ -64,6 +65,9 @@ using tensorflow::AbstractTensorInterface; using tensorflow::dyn_cast; using tensorflow::OutputList; using tensorflow::string; +using tensorflow::errors::FailedPrecondition; +using tensorflow::errors::InvalidArgument; +using tensorflow::errors::Unimplemented; using tensorflow::tracing::TracingContext; using tensorflow::tracing::TracingOperation; using tensorflow::tracing::TracingTensorHandle; @@ -103,6 +107,9 @@ class MlirTensor : public TracingTensorHandle { } Value getValue() { return value_; } + Type getElementType() { + return value_.getType().cast().getElementType(); + } // For LLVM style RTTI. static bool classof(const AbstractTensorHandle* ptr) { @@ -184,11 +191,18 @@ class MlirAbstractOp : public TracingOperation { } private: + // Return true is there are still unfilled ODS slots for adding more inputs. 
+ bool IsNextODSArgAvailable(); + MLIRContext* context_; MlirFunctionContext* function_context_; SmallVector operands_; llvm::StringMap attrs_; std::unique_ptr state_; + // This is the index of the next ODS operand that will be added with AddInput + // or AddInput; + int current_ods_input_ = 0; + const tensorflow::OpDef* op_def_ = nullptr; const char* op_name_ = nullptr; string tf_op_type_; // TODO(srbs): Use this. @@ -244,12 +258,12 @@ class MlirFunctionContext : public TracingContext { Status Finalize(OutputList* outputs, AbstractFunction** f) override; Status RegisterFunction(AbstractFunction* func) override { - return tensorflow::errors::Unimplemented( + return Unimplemented( "Registering graph functions has not been implemented yet."); } Status RemoveFunction(const string& func) override { - return tensorflow::errors::Unimplemented( + return Unimplemented( "MlirFunctionContext::RemoveFunction has not been implemented yet."); } @@ -264,9 +278,12 @@ class MlirFunctionContext : public TracingContext { Status MlirAbstractOp::Reset(const char* op, const char* device_name) { if (state_) { - return tensorflow::errors::FailedPrecondition( - "Reset called on already built op."); + return FailedPrecondition("Reset called on already built op."); } + TF_RETURN_IF_ERROR( + tensorflow::OpRegistry::Global()->LookUpOpDef(op, &op_def_)); + assert(op_def_); + tf_op_type_ = op; std::string name = "tf."; name += op; @@ -277,13 +294,12 @@ Status MlirAbstractOp::Reset(const char* op, const char* device_name) { Status MlirAbstractOp::SetAttrType(const char* attr_name, tensorflow::DataType dtype) { - if (!state_) { - return Status(tensorflow::error::Code::FAILED_PRECONDITION, - "op_type must be specified before specifying attrs."); - } + if (!state_) + return FailedPrecondition( + "op_type must be specified before specifying attrs."); Type mlir_type; Builder builder(context_); - TF_RETURN_IF_ERROR(ConvertDataTypeToTensor(dtype, builder, &mlir_type)); + TF_RETURN_IF_ERROR(ConvertDataType(dtype, builder, &mlir_type)); attrs_[attr_name] = TypeAttr::get(mlir_type); return Status::OK(); } @@ -291,8 +307,7 @@ Status MlirAbstractOp::SetAttrType(const char* attr_name, Status MlirAbstractOp::SetOpName(const char* const op_name) { // TODO(aminim): should we use a location? if (op_name_) { - return tensorflow::errors::FailedPrecondition( - "SetOpName called on already built op."); + return FailedPrecondition("SetOpName called on already built op."); } op_name_ = op_name; return Status::OK(); @@ -301,8 +316,7 @@ Status MlirAbstractOp::SetOpName(const char* const op_name) { Status MlirAbstractOp::AddRef(Type type, Type* output_type) { Type elt_type = getElementTypeOrSelf(type); if (elt_type.isa()) { - return tensorflow::errors::InvalidArgument( - "Requested reference to a reference type"); + return InvalidArgument("Requested reference to a reference type"); } elt_type = TensorFlowRefType::get(elt_type); if (RankedTensorType tensor_type = type.dyn_cast()) { @@ -315,138 +329,97 @@ Status MlirAbstractOp::AddRef(Type type, Type* output_type) { Status MlirAbstractOp::Create(ArrayRef operands, OperationState** state) { state_->operands = llvm::to_vector<4>(operands); - const tensorflow::OpDef* op_def; - auto node_name = state_->name.getStringRef().drop_front( - TensorFlowDialect::getDialectNamespace().size() + 1); - TF_RETURN_IF_ERROR( - tensorflow::OpRegistry::Global()->LookUpOpDef(node_name.str(), &op_def)); Builder builder(context_); - // Process operands according to the op_def and infer derived attributes. 
- int current_operand = 0; - for (const tensorflow::OpDef::ArgDef& input_arg : op_def->input_arg()) { - if (!input_arg.number_attr().empty()) { - // TODO(b/156122856): we don't support variadic operands. - return tensorflow::errors::Unimplemented( - "Unsupported 'number_attr' for '", input_arg.number_attr(), "'"); - } else if (!input_arg.type_list_attr().empty()) { - return tensorflow::errors::InvalidArgument( - "Unsupported 'type_list_attr' for '", input_arg.number_attr(), "'"); - } - if (current_operand >= operands.size()) { - return tensorflow::errors::InvalidArgument("Missing operand for '", - input_arg.name(), "'"); - } - Type expected_type; - if (input_arg.type() != tensorflow::DT_INVALID) { - TF_RETURN_IF_ERROR( - ConvertDataTypeToTensor(input_arg.type(), builder, &expected_type)); - Type output_type; - if (input_arg.is_ref()) - TF_RETURN_IF_ERROR(AddRef(expected_type, &output_type)); - expected_type = output_type; - } else { - expected_type = operands[current_operand].getType(); - } - if (!input_arg.type_attr().empty()) { - attrs_[input_arg.type_attr()] = TypeAttr::get(expected_type); - } - ++current_operand; - } - for (const tensorflow::OpDef::ArgDef& output_arg : op_def->output_arg()) { + if (current_ods_input_ != op_def_->input_arg_size()) + return InvalidArgument(absl::StrCat("Mismatch in operands number: got ", + current_ods_input_, " expected ", + op_def_->input_arg_size(), " ; for op ", + state_->name.getStringRef().str())); + + // Process results according to the op_def and infer types for derived + // attributes. + for (const tensorflow::OpDef::ArgDef& output_arg : op_def_->output_arg()) { int original_size = state_->types.size(); if (!output_arg.number_attr().empty()) { // Same type repeated "repeats" times. Attribute repeats_attr = attrs_[output_arg.number_attr()]; - if (!repeats_attr) { - return tensorflow::errors::InvalidArgument( - "Missing attribute '", output_arg.number_attr(), - "' required for output list '", output_arg.name(), "'"); - } - if (!repeats_attr.isa()) { - return tensorflow::errors::InvalidArgument( - "Attribute '", output_arg.number_attr(), - "' required for output list '", output_arg.name(), - "' isn't an integer"); - } + if (!repeats_attr) + return InvalidArgument("Missing attribute '", output_arg.number_attr(), + "' required for output list '", + output_arg.name(), "'"); + if (!repeats_attr.isa()) + return InvalidArgument("Attribute '", output_arg.number_attr(), + "' required for output list '", + output_arg.name(), "' isn't an integer"); int64_t repeats = repeats_attr.cast().getInt(); if (!output_arg.type_attr().empty()) { // Same type repeated "repeats" times. 
Attribute attr = attrs_[output_arg.type_attr()]; - if (!attr) { - return tensorflow::errors::InvalidArgument( - "Missing attribute '", output_arg.type_attr(), - "' required for output '", output_arg.name(), "'"); - } + if (!attr) + return InvalidArgument("Missing attribute '", output_arg.type_attr(), + "' required for output '", output_arg.name(), + "'"); TypeAttr type_attr = attr.dyn_cast(); - if (!type_attr) { - return tensorflow::errors::InvalidArgument( - "Attribute '", output_arg.type_attr(), "' required for output '", - output_arg.name(), "' isn't a type attribute"); - } + if (!type_attr) + return InvalidArgument("Attribute '", output_arg.type_attr(), + "' required for output '", output_arg.name(), + "' isn't a type attribute"); for (int i = 0; i < repeats; ++i) - state_->types.push_back(type_attr.getType()); + state_->types.push_back(UnrankedTensorType::get(type_attr.getType())); } else if (output_arg.type() != tensorflow::DT_INVALID) { for (int i = 0; i < repeats; ++i) { Type type; TF_RETURN_IF_ERROR( - ConvertDataTypeToTensor(output_arg.type(), builder, &type)); + ConvertDataType(output_arg.type(), builder, &type)); state_->types.push_back(type); } } else { - return tensorflow::errors::InvalidArgument( - "Missing type or type_attr field in ", - output_arg.ShortDebugString()); + return InvalidArgument("Missing type or type_attr field in ", + output_arg.ShortDebugString()); } } else if (!output_arg.type_attr().empty()) { Attribute attr = attrs_[output_arg.type_attr()]; - if (!attr) { - return tensorflow::errors::InvalidArgument( - "Missing attribute '", output_arg.type_attr(), - "' required for output '", output_arg.name(), "'"); - } + if (!attr) + return InvalidArgument("Missing attribute '", output_arg.type_attr(), + "' required for output '", output_arg.name(), + "'"); TypeAttr type_attr = attr.dyn_cast(); - if (!type_attr) { - return tensorflow::errors::InvalidArgument( - "Attribute '", output_arg.type_attr(), "' required for output '", - output_arg.name(), "' isn't a type attribute"); - } - state_->types.push_back(type_attr.getValue()); + if (!type_attr) + return InvalidArgument("Attribute '", output_arg.type_attr(), + "' required for output '", output_arg.name(), + "' isn't a type attribute"); + state_->types.push_back(UnrankedTensorType::get(type_attr.getValue())); } else if (!output_arg.type_list_attr().empty()) { // This is pointing to an attribute which is an array of types. 
Attribute attr = attrs_[output_arg.type_list_attr()]; - if (!attr) { - return tensorflow::errors::InvalidArgument( + if (!attr) + return InvalidArgument( "Missing attribute '", output_arg.type_list_attr(), "' required for output '", output_arg.name(), "'"); - } ArrayAttr array_attr = attr.dyn_cast(); - if (!array_attr) { - return tensorflow::errors::InvalidArgument( - "Attribute '", output_arg.type_list_attr(), - "' required for output '", output_arg.name(), - "' isn't an array attribute"); - } + if (!array_attr) + return InvalidArgument("Attribute '", output_arg.type_list_attr(), + "' required for output '", output_arg.name(), + "' isn't an array attribute"); for (Attribute attr : array_attr) { TypeAttr type_attr = attr.dyn_cast(); - if (!type_attr) { - return tensorflow::errors::InvalidArgument( - "Array Attribute '", output_arg.type_list_attr(), - "' required for output '", output_arg.name(), - "' has a non-Type element"); - } - state_->types.push_back(type_attr.getValue()); + if (!type_attr) + return InvalidArgument("Array Attribute '", + output_arg.type_list_attr(), + "' required for output '", output_arg.name(), + "' has a non-Type element"); + state_->types.push_back(UnrankedTensorType::get(type_attr.getValue())); } } else if (output_arg.type() != tensorflow::DT_INVALID) { Type type; Builder builder(context_); - TF_RETURN_IF_ERROR( - ConvertDataTypeToTensor(output_arg.type(), builder, &type)); + TF_RETURN_IF_ERROR(ConvertDataType(output_arg.type(), builder, &type)); state_->types.push_back(type); } else { - return tensorflow::errors::InvalidArgument("No type fields in ", - output_arg.ShortDebugString()); + return InvalidArgument("No type fields in ", + output_arg.ShortDebugString()); } if (output_arg.is_ref()) { // For all types that were added by this function call, make them refs. 
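One detail worth calling out in the result-type handling above: element types taken from `type_attr`/`type_list_attr` derived attributes are now wrapped in unranked tensor types rather than used directly, presumably because only the element dtype is known at op-construction time and shapes are left to later inference. Below is a compressed sketch of the repeated-result (`number_attr`) branch with the error handling elided; it only uses MLIR calls that appear in the hunk above, and the function and variable names are hypothetical.

#include "llvm/ADT/SmallVector.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/StandardTypes.h"
#include "mlir/IR/Types.h"

// Sketch only: results declared via an OpDef `number_attr` repeat one element
// type `repeats` times; the real code in MlirAbstractOp::Create also checks
// that both attributes are present and of the expected kind.
llvm::SmallVector<mlir::Type, 4> RepeatedResultTypes(
    mlir::Attribute repeats_attr, mlir::Attribute elt_type_attr) {
  llvm::SmallVector<mlir::Type, 4> types;
  const int64_t repeats = repeats_attr.cast<mlir::IntegerAttr>().getInt();
  const mlir::Type elt = elt_type_attr.cast<mlir::TypeAttr>().getValue();
  for (int64_t i = 0; i < repeats; ++i)
    types.push_back(mlir::UnrankedTensorType::get(elt));  // shape inferred later
  return types;
}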
@@ -472,88 +445,67 @@ Status MlirAbstractOp::SetDeviceName(const char* name) { return Status::OK(); } -Status MlirAbstractOp::AddInputList( - absl::Span inputs) { - return tensorflow::errors::Unimplemented( - "AddInputList has not been implemented yet."); -} - Status MlirAbstractOp::SetAttrString(const char* attr_name, const char* data, size_t length) { - return tensorflow::errors::Unimplemented( - "SetAttrString has not been implemented yet."); + return Unimplemented("SetAttrString has not been implemented yet."); } Status MlirAbstractOp::SetAttrInt(const char* attr_name, int64_t value) { - return tensorflow::errors::Unimplemented( - "SetAttrInt has not been implemented yet."); + return Unimplemented("SetAttrInt has not been implemented yet."); } Status MlirAbstractOp::SetAttrFloat(const char* attr_name, float value) { - return tensorflow::errors::Unimplemented( - "SetAttrFloat has not been implemented yet."); + return Unimplemented("SetAttrFloat has not been implemented yet."); } Status MlirAbstractOp::SetAttrBool(const char* attr_name, bool value) { - return tensorflow::errors::Unimplemented( - "SetAttrBool has not been implemented yet."); + return Unimplemented("SetAttrBool has not been implemented yet."); } Status MlirAbstractOp::SetAttrShape(const char* attr_name, const int64_t* dims, const int num_dims) { - return tensorflow::errors::Unimplemented( - "SetAttrShape has not been implemented yet."); + return Unimplemented("SetAttrShape has not been implemented yet."); } Status MlirAbstractOp::SetAttrFunction(const char* attr_name, const AbstractOperation* value) { - return tensorflow::errors::Unimplemented( - "SetAttrFunction has not been implemented yet."); + return Unimplemented("SetAttrFunction has not been implemented yet."); } Status MlirAbstractOp::SetAttrFunctionName(const char* attr_name, const char* value, size_t length) { - return tensorflow::errors::Unimplemented( - "SetAttrFunctionName has not been implemented yet."); + return Unimplemented("SetAttrFunctionName has not been implemented yet."); } Status MlirAbstractOp::SetAttrTensor(const char* attr_name, AbstractTensorInterface* tensor) { - return tensorflow::errors::Unimplemented( - "SetAttrTensor has not been implemented yet."); + return Unimplemented("SetAttrTensor has not been implemented yet."); } Status MlirAbstractOp::SetAttrStringList(const char* attr_name, const void* const* values, const size_t* lengths, int num_values) { - return tensorflow::errors::Unimplemented( - "SetAttrStringList has not been implemented yet."); + return Unimplemented("SetAttrStringList has not been implemented yet."); } Status MlirAbstractOp::SetAttrFloatList(const char* attr_name, const float* values, int num_values) { - return tensorflow::errors::Unimplemented( - "SetAttrFloatList has not been implemented yet."); + return Unimplemented("SetAttrFloatList has not been implemented yet."); } Status MlirAbstractOp::SetAttrIntList(const char* attr_name, const int64_t* values, int num_values) { - return tensorflow::errors::Unimplemented( - "SetAttrIntList has not been implemented yet."); + return Unimplemented("SetAttrIntList has not been implemented yet."); } Status MlirAbstractOp::SetAttrTypeList(const char* attr_name, const tensorflow::DataType* values, int num_values) { - return tensorflow::errors::Unimplemented( - "SetAttrTypeList has not been implemented yet."); + return Unimplemented("SetAttrTypeList has not been implemented yet."); } Status MlirAbstractOp::SetAttrBoolList(const char* attr_name, const unsigned char* values, int 
num_values) { - return tensorflow::errors::Unimplemented( - "SetAttrBoolList has not been implemented yet."); + return Unimplemented("SetAttrBoolList has not been implemented yet."); } Status MlirAbstractOp::SetAttrShapeList(const char* attr_name, const int64_t** dims, const int* num_dims, int num_values) { - return tensorflow::errors::Unimplemented( - "SetAttrShapeList has not been implemented yet."); + return Unimplemented("SetAttrShapeList has not been implemented yet."); } Status MlirAbstractOp::SetAttrFunctionList( const char* attr_name, absl::Span values) { - return tensorflow::errors::Unimplemented( - "SetAttrFunctionList has not been implemented yet."); + return Unimplemented("SetAttrFunctionList has not been implemented yet."); } Status MlirFunction::GetFunctionDef(tensorflow::FunctionDef** f) { @@ -605,28 +557,101 @@ Status MlirFunctionContext::AddParameter(tensorflow::DataType dtype, } Status MlirAbstractOp::AddInput(AbstractTensorHandle* input) { + if (current_ods_input_ >= op_def_->input_arg_size()) + return InvalidArgument( + absl::StrCat("More Input() (", current_ods_input_, ") calls than the ", + op_def_->input_arg_size(), " allowed input_args ; for op ", + state_->name.getStringRef().str())); + auto* operand = dyn_cast(input); - if (!operand) { - return tensorflow::errors::InvalidArgument( - "Unable to cast input to MlirTensor"); - } + if (!operand) return InvalidArgument("Unable to cast input to MlirTensor"); operands_.push_back(operand->getValue()); + + // Get the next ArgDef and use it to infer the derived attributes associated + // to this input. + const tensorflow::OpDef::ArgDef& arg_def = + op_def_->input_arg(current_ods_input_++); + Type expected_type; + if (arg_def.type() != tensorflow::DT_INVALID) { + Builder builder(context_); + TF_RETURN_IF_ERROR( + tensorflow::ConvertDataType(arg_def.type(), builder, &expected_type)); + if (arg_def.is_ref()) { + Type output_type; + TF_RETURN_IF_ERROR(AddRef(expected_type, &output_type)); + expected_type = output_type; + } + } else { + expected_type = cast(input)->getElementType(); + } + if (!arg_def.type_attr().empty()) + attrs_[arg_def.type_attr()] = TypeAttr::get(expected_type); + return Status::OK(); } + +Status MlirAbstractOp::AddInputList( + absl::Span inputs) { + if (current_ods_input_ >= op_def_->input_arg_size()) + return InvalidArgument( + absl::StrCat("More Input() (", current_ods_input_, ") calls than the ", + op_def_->input_arg_size(), " allowed input_args")); + + for (AbstractTensorHandle* input : inputs) { + auto* operand = dyn_cast(input); + if (!operand) return InvalidArgument("Unable to cast input to MlirTensor"); + operands_.push_back(operand->getValue()); + } + + // Get the next ArgDef and use it to infer the derived attributes associated + // to this input. + const tensorflow::OpDef::ArgDef& arg_def = + op_def_->input_arg(current_ods_input_++); + if (!arg_def.number_attr().empty()) { + Builder builder(context_); + attrs_[arg_def.number_attr()] = builder.getI32IntegerAttr(inputs.size()); + // TODO(aminim): handle ref variable. + if (arg_def.type() != tensorflow::DT_INVALID) { + // TODO(aminim): check type wrt input + Type arg_def_type; + TF_RETURN_IF_ERROR( + ConvertDataType(arg_def.type(), builder, &arg_def_type)); + // Ensure each of the type in the list matches the op def type. + // TODO(aminim): can we improve the error message with the actual types? 
+ for (AbstractTensorHandle* input : inputs) + if (arg_def_type != cast(input)->getElementType()) + return InvalidArgument( + "Invalid input list: type mismatch the op def expectation"); + } else if (!inputs.empty()) { + if (arg_def.type_attr().empty()) + return FailedPrecondition( + "Invalid opdef type constraint: either type or type_attr required"); + + attrs_[arg_def.type_attr()] = + TypeAttr::get(cast(inputs.front())->getElementType()); + } + } else if (!arg_def.type_list_attr().empty()) { + // TODO(aminim): handle ref variable. + SmallVector types; + types.reserve(inputs.size()); + for (AbstractTensorHandle* input : inputs) + types.push_back(TypeAttr::get(cast(input)->getElementType())); + attrs_[arg_def.type_list_attr()] = ArrayAttr::get(types, GetContext()); + } + return Status::OK(); +} + Status MlirFunctionContext::Finalize(OutputList* outputs, AbstractFunction** f) { Block& body = func_.getBody().front(); SmallVector ret_operands; for (auto* output : outputs->outputs) { auto* operand = dyn_cast(output); - if (!operand) { - return tensorflow::errors::InvalidArgument( - "Capturing eager tensors is not supported yet."); - } - if (operand->getValue().getContext() != context_.get()) { - return tensorflow::errors::InvalidArgument( + if (!operand) + return InvalidArgument("Capturing eager tensors is not supported yet."); + if (operand->getValue().getContext() != context_.get()) + return InvalidArgument( "Capturing tensors from other context is not supported."); - } ret_operands.push_back(operand->getValue()); } builder_.create(func_.getLoc(), ret_operands); diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.cc index dfad1fce26d..40cc2c99c27 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.cc @@ -74,12 +74,9 @@ struct FuncAttrStorage : public AttributeStorage { // Get or create a shape attribute. ShapeAttr ShapeAttr::get(mlir::MLIRContext* context, llvm::Optional> shape) { - if (shape) - return Base::get(context, AttrKind::SHAPE, *shape, - /*unranked=*/false); + if (shape) return Base::get(context, *shape, /*unranked=*/false); - return Base::get(context, AttrKind::SHAPE, ArrayRef(), - /*unranked=*/true); + return Base::get(context, ArrayRef(), /*unranked=*/true); } llvm::Optional> ShapeAttr::getValue() const { @@ -112,12 +109,12 @@ bool ShapeAttr::hasStaticShape() const { FuncAttr FuncAttr::get(mlir::MLIRContext* context, llvm::StringRef name, DictionaryAttr attr) { auto symbol = SymbolRefAttr::get(name, context); - return Base::get(context, AttrKind::FUNC, symbol, attr); + return Base::get(context, symbol, attr); } FuncAttr FuncAttr::get(mlir::MLIRContext* context, SymbolRefAttr symbol, DictionaryAttr attr) { - return Base::get(context, AttrKind::FUNC, symbol, attr); + return Base::get(context, symbol, attr); } SymbolRefAttr FuncAttr::GetName() const { diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h index e0fef228eb4..5a18b77ab5c 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h @@ -24,19 +24,6 @@ limitations under the License. namespace mlir { namespace TF { -namespace AttrKind { - -// List of supported custom TensorFlow Attribute kinds, necessary for -// isa/dyn_cast. 
-enum Kind { - FIRST_USED_TENSORFLOW_ATTR = Attribute::FIRST_TENSORFLOW_ATTR, - SHAPE = FIRST_USED_TENSORFLOW_ATTR, - FUNC, - LAST_USED_TENSORFLOW_ATTR, -}; - -} // namespace AttrKind - namespace detail { struct ShapeAttrStorage; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc index 9c2968fab37..ea9ae5d9477 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc @@ -54,9 +54,6 @@ namespace tf_executor { namespace { -using TF::DropRefType; -using TF::DropTypeSubTypes; - struct TensorFlowExecutorInlinerInterface : public DialectInlinerInterface { using DialectInlinerInterface::DialectInlinerInterface; @@ -75,9 +72,8 @@ struct TensorFlowExecutorInlinerInterface : public DialectInlinerInterface { } }; -struct TensorFlowExecutorOpFolderDialectInterface - : public OpFolderDialectInterface { - using OpFolderDialectInterface::OpFolderDialectInterface; +struct TensorFlowExecutorDialectFoldInterface : public DialectFoldInterface { + using DialectFoldInterface::DialectFoldInterface; // Registered hook to check if the given region, which is attached to an // operation that is *not* isolated from above (i.e. no internal regions @@ -100,7 +96,7 @@ TensorFlowExecutorDialect::TensorFlowExecutorDialect(MLIRContext *context) >(); addInterfaces(); + TensorFlowExecutorDialectFoldInterface>(); addTypes(); } @@ -551,8 +547,8 @@ LogicalResult Verify(SwitchNOp switchn) { << operand0_tensor_type << " vs " << output_tensor_type; } Type broadcasted_type = OpTrait::util::getBroadcastedType( - DropRefType(DropTypeSubTypes(operand0_tensor_type)), - DropRefType(DropTypeSubTypes(output_tensor_type))); + TF::DropRefAndSubTypes(operand0_tensor_type), + TF::DropRefAndSubTypes(output_tensor_type)); if (!broadcasted_type) { return switchn.emitOpError() << "expects data operand to be broadcastable with all output types" @@ -668,8 +664,8 @@ LogicalResult Verify(MergeOp merge) { << operand_tensor_ty << " vs " << output_tensor_ty; } Type broadcasted_type = OpTrait::util::getBroadcastedType( - DropRefType(DropTypeSubTypes(output_tensor_ty)), - DropRefType(DropTypeSubTypes(operand_tensor_ty))); + TF::DropRefAndSubTypes(output_tensor_ty), + TF::DropRefAndSubTypes(operand_tensor_ty)); if (!broadcasted_type) return merge.emitOpError() << "expects all operands to be broadcastable with output type" diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h index da63826a6d4..60036ddc9f8 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h @@ -45,31 +45,16 @@ class TensorFlowExecutorDialect : public Dialect { void printType(Type type, DialectAsmPrinter &os) const override; }; -namespace TFTypes { -enum Kind { - Control = Type::FIRST_TENSORFLOW_EXECUTOR_TYPE, - Token, -}; -} // namespace TFTypes - // The Control type is a token-like value that models control dependencies from // TensorFlow graphs. class ControlType : public Type::TypeBase { public: using Base::Base; - - static ControlType get(MLIRContext *context) { - return Base::get(context, TFTypes::Control); - } }; class TokenType : public Type::TypeBase { public: using Base::Base; - - static TokenType get(MLIRContext *context) { - return Base::get(context, TFTypes::Token); - } }; // Declares the operations for this dialect using the generated header. 
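The tf_attributes and tf_executor hunks above remove the hand-maintained `AttrKind` and `TFTypes::Kind` enums: singleton types such as `ControlType` are now obtained with the plain `Base::get(context)`, and dispatch over attributes and types goes through `isa`/`dyn_cast` driven by each class's `classof` (the later `printAttribute` change in tf_ops.cc follows the same pattern). A standalone toy sketch of that dispatch style, with a hand-rolled `dyn_cast` rather than the LLVM one:

```cpp
// Hypothetical, self-contained illustration (toy types, not the MLIR API):
// instead of switching on a public kind enum, the discriminator becomes an
// implementation detail behind classof(), and callers dispatch with
// isa/dyn_cast-style checks.
#include <iostream>

struct Attr {
  enum class Id { Shape, Func };
  Id id;  // storage detail only; no longer part of the public interface
  explicit Attr(Id id) : id(id) {}
  virtual ~Attr() = default;
};
struct ShapeAttr : Attr {
  ShapeAttr() : Attr(Id::Shape) {}
  static bool classof(const Attr *a) { return a->id == Id::Shape; }
};
struct FuncAttr : Attr {
  FuncAttr() : Attr(Id::Func) {}
  static bool classof(const Attr *a) { return a->id == Id::Func; }
};

// Minimal stand-in for llvm::dyn_cast.
template <typename T>
const T *dyn_cast(const Attr *a) {
  return T::classof(a) ? static_cast<const T *>(a) : nullptr;
}

void PrintAttribute(const Attr *attr) {
  if (dyn_cast<ShapeAttr>(attr))
    std::cout << "shape attribute\n";
  else if (dyn_cast<FuncAttr>(attr))
    std::cout << "func attribute\n";
  else
    std::cout << "unexpected attribute\n";
}

int main() {
  ShapeAttr shape;
  FuncAttr func;
  PrintAttribute(&shape);
  PrintAttribute(&func);
}
```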
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index cc07d50eee2..283e3326029 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -136,7 +136,7 @@ Inputs must be of same size and shape. let hasFolder = 1; } -def TF_AddV2Op : TF_Op<"AddV2", [Commutative, NoSideEffect, ResultsBroadcastableShape, TF_LayoutAgnostic, TF_SameOperandsAndResultElementTypeResolveRef, TF_CwiseBinary]>, +def TF_AddV2Op : TF_Op<"AddV2", [Commutative, NoSideEffect, ResultsBroadcastableShape, TF_CwiseBinary, TF_LayoutAgnostic, TF_SameOperandsAndResultElementTypeResolveRef]>, WithBroadcastableBinOpBuilder { let summary = "Returns x + y element-wise."; @@ -859,15 +859,15 @@ about broadcasting }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$x, - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$y, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, TF_Complex128, TF_Complex64]>:$x, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, TF_Complex128, TF_Complex64]>:$y, DefaultValuedAttr:$adj_x, DefaultValuedAttr:$adj_y ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$output + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, TF_Complex128, TF_Complex64]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -965,6 +965,40 @@ reverse of SpaceToBatch. See below for a precise description. TF_DerivedOperandTypeAttr Tblock_shape = TF_DerivedOperandTypeAttr<1>; } +def TF_BetaincOp : TF_Op<"Betainc", [NoSideEffect]> { + let summary = [{ +Compute the regularized incomplete beta integral \\(I_x(a, b)\\). + }]; + + let description = [{ +The regularized incomplete beta integral is defined as: + + +\\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\) + +where + + +\\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\) + + +is the incomplete beta function and \\(B(a, b)\\) is the *complete* +beta function. + }]; + + let arguments = (ins + TF_F32OrF64Tensor:$a, + TF_F32OrF64Tensor:$b, + TF_F32OrF64Tensor:$x + ); + + let results = (outs + TF_F32OrF64Tensor:$z + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_BiasAddOp : TF_Op<"BiasAdd", [NoSideEffect]> { let summary = "Adds `bias` to `value`."; @@ -1319,6 +1353,7 @@ subsequent operation and then be optimized away, however.) let verifier = [{ return Verify(*this); }]; + let hasFolder = 1; } def TF_BucketizeOp : TF_Op<"Bucketize", [NoSideEffect, SameOperandsAndResultShape]> { @@ -1404,6 +1439,38 @@ that are not a number (NaN) or infinity (Inf). Otherwise, passes `tensor` as-is. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_CholeskyOp : TF_Op<"Cholesky", [NoSideEffect]> { + let summary = [{ +Computes the Cholesky decomposition of one or more square matrices. + }]; + + let description = [{ +The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions +form square matrices. + +The input has to be symmetric and positive definite. Only the lower-triangular +part of the input will be used for this operation. The upper-triangular part +will not be read. + +The output is a tensor of the same shape as the input +containing the Cholesky decompositions for all input submatrices `[..., :, :]`. + +**Note**: The gradient computation on GPU is faster for large matrices but +not for large batch dimensions when the submatrices are small. 
In this +case it might be faster to use the CPU. + }]; + + let arguments = (ins + TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$input + ); + + let results = (outs + TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_ClipByValueOp : TF_Op<"ClipByValue", [NoSideEffect, TF_SameOperandsAndResultElementTypeResolveRef]> { let summary = "Clips tensor values to a specified min and max."; @@ -2025,17 +2092,73 @@ and `B, D, F, H` as group 1. Thus we get the outputs: }]; let arguments = (ins - TensorOf<[BF16, F32, I32, TF_Uint32]>:$input, + TensorOf<[BF16, F16, F32, I32, TF_Uint32]>:$input, I32Tensor:$group_assignment ); let results = (outs - TensorOf<[BF16, F32, I32, TF_Uint32]>:$output + TensorOf<[BF16, F16, F32, I32, TF_Uint32]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_CumprodOp : TF_Op<"Cumprod", [NoSideEffect, TF_AllTypesMatch<["x", "out"]>]> { + let summary = [{ +Compute the cumulative product of the tensor `x` along `axis`. + }]; + + let description = [{ +By default, this op performs an inclusive cumprod, which means that the first +element of the input is identical to the first element of the output: + +```python +tf.cumprod([a, b, c]) # => [a, a * b, a * b * c] +``` + +By setting the `exclusive` kwarg to `True`, an exclusive cumprod is +performed instead: + +```python +tf.cumprod([a, b, c], exclusive=True) # => [1, a, a * b] +``` + +By setting the `reverse` kwarg to `True`, the cumprod is performed in the +opposite direction: + +```python +tf.cumprod([a, b, c], reverse=True) # => [a * b * c, b * c, c] +``` + +This is more efficient than using separate `tf.reverse` ops. + +The `reverse` and `exclusive` kwargs can also be combined: + +```python +tf.cumprod([a, b, c], exclusive=True, reverse=True) # => [b * c, c, 1] +``` + }]; + + let arguments = (ins + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$x, + TF_I32OrI64Tensor:$axis, + + DefaultValuedAttr:$exclusive, + DefaultValuedAttr:$reverse + ); + + let results = (outs + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$out + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<1>; + + let verifier = [{ + return Verify(*this); + }]; +} + def TF_CumsumOp : TF_Op<"Cumsum", [NoSideEffect, TF_AllTypesMatch<["x", "out"]>]> { let summary = "Compute the cumulative sum of the tensor `x` along `axis`."; @@ -2084,6 +2207,10 @@ tf.cumsum([a, b, c], exclusive=True, reverse=True) # => [b + c, c, 0] TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<1>; + + let verifier = [{ + return Verify(*this); + }]; } def TF_DataFormatDimMapOp : TF_Op<"DataFormatDimMap", [NoSideEffect, SameOperandsAndResultType]> { @@ -2109,6 +2236,82 @@ the source data format. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_DataFormatVecPermuteOp : TF_Op<"DataFormatVecPermute", [NoSideEffect, SameOperandsAndResultType]> { + let summary = "Permute input tensor from `src_format` to `dst_format`."; + + let description = [{ +Input tensor must be a vector of size 4, or a 4x2 tensor. 
+ +For example, with `src_format` of `NHWC`, `dst_format` of `NCHW`, and inputs: +``` +[1, 2, 3, 4] +``` +and +``` +[[1, 2, 3, 4], + [5, 6, 7, 8]] +``` +, the outputs will be (respectively): +``` +[1, 4, 2, 3] +``` +and +``` +[[1, 4, 2, 3], + [5, 8, 6, 7]] +``` + }]; + + let arguments = (ins + TF_I32OrI64Tensor:$x, + + DefaultValuedAttr:$src_format, + DefaultValuedAttr:$dst_format + ); + + let results = (outs + TF_I32OrI64Tensor:$y + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + + let verifier = [{ return Verify(*this); }]; +} + +def TF_DebugIdentityV2Op : TF_Op<"DebugIdentityV2", []> { + let summary = "Debug Identity V2 Op."; + + let description = [{ +Provides an identity mapping from input to output, while writing the content of +the input tensor by calling DebugEventsWriter. + +The semantics of the input tensor depends on tensor_debug_mode. In typical +usage, the input tensor comes directly from the user computation only when +graph_debug_mode is FULL_TENSOR (see protobuf/debug_event.proto for a +list of all the possible values of graph_debug_mode). For the other debug modes, +the input tensor should be produced by an additional op or subgraph that +computes summary information about one or more tensors. + }]; + + let arguments = (ins + TF_Tensor:$input, + + StrAttr:$tfdbg_context_id, + StrAttr:$op_name, + DefaultValuedAttr:$output_slot, + DefaultValuedAttr:$tensor_debug_mode, + DefaultValuedAttr:$debug_urls, + DefaultValuedAttr:$circular_buffer_size, + StrAttr:$tfdbg_run_id + ); + + let results = (outs + TF_Tensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_DecodeAndCropJpegOp : TF_Op<"DecodeAndCropJpeg", [NoSideEffect]> { let summary = "Decode and Crop a JPEG-encoded image to a uint8 tensor."; @@ -2402,6 +2605,54 @@ horizontal and vertices strides, `strides = [1, stride, stride, 1]`. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_DepthwiseConv2dNativeBackpropFilterOp : TF_Op<"DepthwiseConv2dNativeBackpropFilter", [NoSideEffect]> { + let summary = [{ +Computes the gradients of depthwise convolution with respect to the filter. + }]; + + let arguments = (ins + TF_FpTensor:$input, + I32Tensor:$filter_sizes, + TF_FpTensor:$out_backprop, + + I64ArrayAttr:$strides, + TF_AnyStrAttrOf<["SAME", "VALID", "EXPLICIT"]>:$padding, + DefaultValuedAttr:$explicit_paddings, + DefaultValuedAttr:$data_format, + DefaultValuedAttr:$dilations + ); + + let results = (outs + TF_FpTensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_DepthwiseConv2dNativeBackpropInputOp : TF_Op<"DepthwiseConv2dNativeBackpropInput", [NoSideEffect]> { + let summary = [{ +Computes the gradients of depthwise convolution with respect to the input. + }]; + + let arguments = (ins + I32Tensor:$input_sizes, + TF_FpTensor:$filter, + TF_FpTensor:$out_backprop, + + I64ArrayAttr:$strides, + TF_AnyStrAttrOf<["SAME", "VALID", "EXPLICIT"]>:$padding, + DefaultValuedAttr:$explicit_paddings, + DefaultValuedAttr:$data_format, + DefaultValuedAttr:$dilations + ); + + let results = (outs + TF_FpTensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<1>; +} + def TF_DeviceIndexOp : TF_Op<"DeviceIndex", [NoSideEffect]> { let summary = "Return the index of device the op runs."; @@ -2421,6 +2672,40 @@ this op runs. 
The length of the list is returned in two cases: ); } +def TF_DiagOp : TF_Op<"Diag", [NoSideEffect, SameOperandsAndResultElementType]> { + let summary = "Returns a diagonal tensor with a given diagonal values."; + + let description = [{ +Given a `diagonal`, this operation returns a tensor with the `diagonal` and +everything else padded with zeros. The diagonal is computed as follows: + +Assume `diagonal` has dimensions [D1,..., Dk], then the output is a tensor of +rank 2k with dimensions [D1,..., Dk, D1,..., Dk] where: + +`output[i1,..., ik, i1,..., ik] = diagonal[i1, ..., ik]` and 0 everywhere else. + +For example: + +``` +# 'diagonal' is [1, 2, 3, 4] +tf.diag(diagonal) ==> [[1, 0, 0, 0] + [0, 2, 0, 0] + [0, 0, 3, 0] + [0, 0, 0, 4]] +``` + }]; + + let arguments = (ins + TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$diagonal + ); + + let results = (outs + TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_DiagPartOp : TF_Op<"DiagPart", [NoSideEffect]> { let summary = "Returns the diagonal part of the tensor."; @@ -3075,6 +3360,27 @@ i.e. `exp(x) - 1` or `e^(x) - 1`, where `x` is the input tensor. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_ExtractImagePatchesOp : TF_Op<"ExtractImagePatches", [NoSideEffect]> { + let summary = [{ +Extract `patches` from `images` and put them in the "depth" output dimension. + }]; + + let arguments = (ins + TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$images, + + Confined]>:$ksizes, + Confined]>:$strides, + Confined]>:$rates, + TF_AnyStrAttrOf<["SAME", "VALID"]>:$padding + ); + + let results = (outs + TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$patches + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_FFTOp : TF_Op<"FFT", [NoSideEffect]> { let summary = "Fast Fourier transform."; @@ -4185,6 +4491,22 @@ tf.imag(input) ==> [4.75, 5.75] TF_DerivedResultTypeAttr Tout = TF_DerivedResultTypeAttr<0>; } +def TF_InfeedDequeueOp : TF_Op<"InfeedDequeue", []> { + let summary = [{ +A placeholder op for a value that will be fed into the computation. + }]; + + let arguments = (ins + TF_ShapeAttr:$shape + ); + + let results = (outs + TF_Tensor:$output + ); + + TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; +} + def TF_InitializeTableFromTextFileV2Op : TF_Op<"InitializeTableFromTextFileV2", []> { let summary = "Initializes a table from a text file."; @@ -4730,6 +5052,49 @@ tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0 11.0 12.0] TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<2>; } +def TF_ListDiffOp : TF_Op<"ListDiff", [NoSideEffect]> { + let summary = [{ +Computes the difference between two lists of numbers or strings. + }]; + + let description = [{ +Given a list `x` and a list `y`, this operation returns a list `out` that +represents all values that are in `x` but not in `y`. The returned list `out` +is sorted in the same order that the numbers appear in `x` (duplicates are +preserved). This operation also returns a list `idx` that represents the +position of each `out` element in `x`. 
In other words: + +`out[i] = x[idx[i]] for i in [0, 1, ..., len(out) - 1]` + +For example, given this input: + +``` +x = [1, 2, 3, 4, 5, 6] +y = [1, 3, 5] +``` + +This operation would return: + +``` +out ==> [2, 4, 6] +idx ==> [1, 3, 5] +``` + }]; + + let arguments = (ins + TF_Tensor:$x, + TF_Tensor:$y + ); + + let results = (outs + TF_Tensor:$out, + TF_I32OrI64Tensor:$idx + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedResultTypeAttr out_idx = TF_DerivedResultTypeAttr<1>; +} + def TF_LogOp : TF_Op<"Log", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes natural logarithm of x element-wise."; @@ -4913,6 +5278,44 @@ def TF_LookupTableSizeV2Op : TF_Op<"LookupTableSizeV2", []> { ); } +def TF_LowerBoundOp : TF_Op<"LowerBound", [NoSideEffect]> { + let summary = [{ +Applies lower_bound(sorted_search_values, values) along each row. + }]; + + let description = [{ +Each set of rows with the same index in (sorted_inputs, values) is treated +independently. The resulting row is the equivalent of calling +`np.searchsorted(sorted_inputs, values, side='left')`. + +The result is not a global index to the entire +`Tensor`, but rather just the index in the last dimension. + +A 2-D example: + sorted_sequence = [[0, 3, 9, 9, 10], + [1, 2, 3, 4, 5]] + values = [[2, 4, 9], + [0, 2, 6]] + + result = LowerBound(sorted_sequence, values) + + result == [[1, 2, 2], + [0, 1, 5]] + }]; + + let arguments = (ins + TF_Tensor:$sorted_inputs, + TF_Tensor:$values + ); + + let results = (outs + TF_I32OrI64Tensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedResultTypeAttr out_type = TF_DerivedResultTypeAttr<0>; +} + def TF_MatMulOp : TF_Op<"MatMul", [NoSideEffect, TF_SameOperandsAndResultElementTypeResolveRef]> { let summary = [{ Multiply the matrix "a" by the matrix "b". @@ -5422,6 +5825,36 @@ tf.matrix_diag(diagonal, k = -1, num_rows = 3, padding_value = 9) TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_MatrixInverseOp : TF_Op<"MatrixInverse", [NoSideEffect]> { + let summary = [{ +Computes the inverse of one or more square invertible matrices or their adjoints (conjugate transposes). + }]; + + let description = [{ +The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions +form square matrices. The output is a tensor of the same shape as the input +containing the inverse for all input submatrices `[..., :, :]`. + +The op uses LU decomposition with partial pivoting to compute the inverses. + +If a matrix is not invertible there is no guarantee what the op does. It +may detect the condition and raise an exception or it may simply return a +garbage result. + }]; + + let arguments = (ins + TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$input, + + DefaultValuedAttr:$adjoint + ); + + let results = (outs + TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_MatrixSetDiagOp : TF_Op<"MatrixSetDiag", [NoSideEffect]> { let summary = [{ Returns a batched matrix tensor with new batched diagonal values. @@ -5673,6 +6106,100 @@ tf.matrix_set_diag(input, diagonals, k = (-1, 2), align="LEFT_RIGHT") TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_MatrixSolveOp : TF_Op<"MatrixSolve", [NoSideEffect]> { + let summary = "Solves systems of linear equations."; + + let description = [{ +`Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions +form square matrices. 
`Rhs` is a tensor of shape `[..., M, K]`. The `output` is +a tensor shape `[..., M, K]`. If `adjoint` is `False` then each output matrix +satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`. +If `adjoint` is `True` then each output matrix satisfies +`adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`. + }]; + + let arguments = (ins + TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$matrix, + TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$rhs, + + DefaultValuedAttr:$adjoint + ); + + let results = (outs + TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_MatrixTriangularSolveOp : TF_Op<"MatrixTriangularSolve", [NoSideEffect]> { + let summary = [{ +Solves systems of linear equations with upper or lower triangular matrices by backsubstitution. + }]; + + let description = [{ +`matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form +square matrices. If `lower` is `True` then the strictly upper triangular part +of each inner-most matrix is assumed to be zero and not accessed. +If `lower` is False then the strictly lower triangular part of each inner-most +matrix is assumed to be zero and not accessed. +`rhs` is a tensor of shape `[..., M, N]`. + +The output is a tensor of shape `[..., M, N]`. If `adjoint` is +`True` then the innermost matrices in `output` satisfy matrix equations +`matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`. +If `adjoint` is `False` then the strictly then the innermost matrices in +`output` satisfy matrix equations +`adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`. + +Note, the batch shapes for the inputs only need to broadcast. + +Example: +```python + +a = tf.constant([[3, 0, 0, 0], + [2, 1, 0, 0], + [1, 0, 1, 0], + [1, 1, 1, 1]], dtype=tf.float32) + +b = tf.constant([[4], + [2], + [4], + [2]], dtype=tf.float32) + +x = tf.linalg.triangular_solve(a, b, lower=True) +x +# + +# in python3 one can use `a@x` +tf.matmul(a, x) +# +``` + }]; + + let arguments = (ins + TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$matrix, + TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$rhs, + + DefaultValuedAttr:$lower, + DefaultValuedAttr:$adjoint + ); + + let results = (outs + TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_MaxOp : TF_Op<"Max", [NoSideEffect]> { let summary = [{ Computes the maximum of elements across dimensions of a tensor. @@ -5818,12 +6345,44 @@ def TF_MaximumOp : TF_Op<"Maximum", [NoSideEffect, ResultsBroadcastableShape, TF TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_MeanOp : TF_Op<"Mean", [NoSideEffect, TF_FoldOperandsTransposeInterface]> { + let summary = "Computes the mean of elements across dimensions of a tensor."; + + let description = [{ +Reduces `input` along the dimensions given in `axis`. Unless +`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in +`axis`. If `keep_dims` is true, the reduced dimensions are +retained with length 1. 
+ }]; + + let arguments = (ins + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, + TF_I32OrI64Tensor:$reduction_indices, + + DefaultValuedAttr:$keep_dims + ); + + let results = (outs + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<1>; + + let extraClassDeclaration = [{ + // TF_FoldOperandsTransposeInterface: + SmallVector GetLayoutDependentArgs() { return {0}; } + SmallVector GetLayoutDependentResults() { return {}; } + LogicalResult FoldOperandsPermutation(ArrayRef permutation); + }]; +} + def TF_MergeSummaryOp : TF_Op<"MergeSummary", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Merges summaries."; let description = [{ This op creates a -[`Summary`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/summary.proto) +[`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto) protocol buffer that contains the union of all the values in the input summaries. @@ -6054,7 +6613,7 @@ the result here is consistent with a truncating divide. E.g. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_MulOp : TF_Op<"Mul", [Commutative, NoSideEffect, ResultsBroadcastableShape, TF_SameOperandsAndResultElementTypeResolveRef, TF_CwiseBinary]>, +def TF_MulOp : TF_Op<"Mul", [Commutative, NoSideEffect, ResultsBroadcastableShape, TF_CwiseBinary, TF_SameOperandsAndResultElementTypeResolveRef]>, WithBroadcastableBinOpBuilder { let summary = "Returns x * y element-wise."; @@ -7215,9 +7774,6 @@ def TF_RangeDatasetOp : TF_Op<"RangeDataset", []> { Creates a dataset with a range of values. Corresponds to python's xrange. }]; - let description = [{ - }]; - let arguments = (ins I64Tensor:$start, I64Tensor:$stop, @@ -8111,6 +8667,47 @@ rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) ==> [-2., -2., -0., 0., 2., 2., 2.] TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_RollOp : TF_Op<"Roll", [NoSideEffect]> { + let summary = "Rolls the elements of a tensor along an axis."; + + let description = [{ +The elements are shifted positively (towards larger indices) by the offset of +`shift` along the dimension of `axis`. Negative `shift` values will shift +elements in the opposite direction. Elements that roll passed the last position +will wrap around to the first and vice versa. Multiple shifts along multiple +axes may be specified. 
+ +For example: + +``` +# 't' is [0, 1, 2, 3, 4] +roll(t, shift=2, axis=0) ==> [3, 4, 0, 1, 2] + +# shifting along multiple dimensions +# 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]] +roll(t, shift=[1, -2], axis=[0, 1]) ==> [[7, 8, 9, 5, 6], [2, 3, 4, 0, 1]] + +# shifting along the same axis multiple times +# 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]] +roll(t, shift=[2, -3], axis=[1, 1]) ==> [[1, 2, 3, 4, 0], [6, 7, 8, 9, 5]] +``` + }]; + + let arguments = (ins + TF_Tensor:$input, + TF_I32OrI64Tensor:$shift, + TF_I32OrI64Tensor:$axis + ); + + let results = (outs + TF_Tensor:$output + ); + + TF_DerivedOperandTypeAttr Tshift = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr Taxis = TF_DerivedOperandTypeAttr<2>; +} + def TF_RoundOp : TF_Op<"Round", [NoSideEffect, SameOperandsAndResultType]> { let summary = [{ Rounds the values of a tensor to the nearest integer, element-wise. @@ -8858,6 +9455,8 @@ size(t) ==> 12 let verifier = [{ return Verify(*this); }]; + + let hasFolder = 1; } def TF_SliceOp : TF_Op<"Slice", [NoSideEffect]> { @@ -9464,7 +10063,7 @@ I.e., \\(y = x * x = x^2\\). def TF_SquaredDifferenceOp : TF_Op<"SquaredDifference", [Commutative, NoSideEffect, ResultsBroadcastableShape]>, WithBroadcastableBinOpBuilder { - let summary = "Returns (x - y)(x - y) element-wise."; + let summary = "Returns conj(x - y)(x - y) element-wise."; let description = [{ *NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting @@ -9576,6 +10175,49 @@ def TF_StackV2Op : TF_Op<"StackV2", []> { ); } +def TF_StatelessMultinomialOp : TF_Op<"StatelessMultinomial", [NoSideEffect]> { + let summary = "Draws samples from a multinomial distribution."; + + let arguments = (ins + TF_IntOrFpTensor:$logits, + I32Tensor:$num_samples, + TF_I32OrI64Tensor:$seed + ); + + let results = (outs + TF_I32OrI64Tensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr Tseed = TF_DerivedOperandTypeAttr<2>; + TF_DerivedResultTypeAttr output_dtype = TF_DerivedResultTypeAttr<0>; +} + +def TF_StatelessRandomNormalOp : TF_Op<"StatelessRandomNormal", [NoSideEffect]> { + let summary = [{ +Outputs deterministic pseudorandom values from a normal distribution. + }]; + + let description = [{ +The generated values will have mean 0 and standard deviation 1. + +The outputs are a deterministic function of `shape` and `seed`. + }]; + + let arguments = (ins + TF_I32OrI64Tensor:$shape, + TF_I32OrI64Tensor:$seed + ); + + let results = (outs + TF_FpTensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr Tseed = TF_DerivedOperandTypeAttr<1>; + TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; +} + def TF_StatelessRandomUniformOp : TF_Op<"StatelessRandomUniform", [NoSideEffect]> { let summary = [{ Outputs deterministic pseudorandom random values from a uniform distribution. @@ -9602,6 +10244,33 @@ The outputs are a deterministic function of `shape` and `seed`. TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; } +def TF_StatelessRandomUniformIntOp : TF_Op<"StatelessRandomUniformInt", [NoSideEffect]> { + let summary = [{ +Outputs deterministic pseudorandom random integers from a uniform distribution. + }]; + + let description = [{ +The generated values follow a uniform distribution in the range `[minval, maxval)`. + +The outputs are a deterministic function of `shape`, `seed`, `minval`, and `maxval`. 
+ }]; + + let arguments = (ins + TF_I32OrI64Tensor:$shape, + TF_I32OrI64Tensor:$seed, + TF_I32OrI64Tensor:$minval, + TF_I32OrI64Tensor:$maxval + ); + + let results = (outs + TF_I32OrI64Tensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr Tseed = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr dtype = TF_DerivedOperandTypeAttr<2>; +} + def TF_StatelessTruncatedNormalOp : TF_Op<"StatelessTruncatedNormal", [NoSideEffect]> { let summary = [{ Outputs deterministic pseudorandom values from a truncated normal distribution. @@ -9871,7 +10540,37 @@ Examples: TF_DerivedOperandSizeAttr N = TF_DerivedOperandSizeAttr<0>; } -def TF_SubOp : TF_Op<"Sub", [NoSideEffect, ResultsBroadcastableShape, TF_SameOperandsAndResultElementTypeResolveRef, TF_CwiseBinary]>, +def TF_StringToHashBucketFastOp : TF_Op<"StringToHashBucketFast", [NoSideEffect]> { + let summary = [{ +Converts each string in the input Tensor to its hash mod by a number of buckets. + }]; + + let description = [{ +The hash function is deterministic on the content of the string within the +process and will never change. However, it is not suitable for cryptography. +This function may be used when CPU time is scarce and inputs are trusted or +unimportant. There is a risk of adversaries constructing inputs that all hash +to the same bucket. To prevent this problem, use a strong hash function with +`tf.string_to_hash_bucket_strong`. + +Examples: + +>>> tf.strings.to_hash_bucket_fast(["Hello", "TensorFlow", "2.x"], 3).numpy() +array([0, 2, 2]) + }]; + + let arguments = (ins + TF_StrTensor:$input, + + Confined]>:$num_buckets + ); + + let results = (outs + I64Tensor:$output + ); +} + +def TF_SubOp : TF_Op<"Sub", [NoSideEffect, ResultsBroadcastableShape, TF_CwiseBinary, TF_SameOperandsAndResultElementTypeResolveRef]>, WithBroadcastableBinOpBuilder { let summary = "Returns x - y element-wise."; @@ -9926,6 +10625,25 @@ retained with length 1. >]; } +def TF_SymbolicGradientOp : TF_Op<"SymbolicGradient", [NoSideEffect]> { + let summary = [{ +Computes the gradient function for function f via backpropagation. + }]; + + let arguments = (ins + Variadic:$input, + + SymbolRefAttr:$f + ); + + let results = (outs + Variadic:$output + ); + + TF_DerivedOperandTypeListAttr Tin = TF_DerivedOperandTypeListAttr<0>; + TF_DerivedResultTypeListAttr Tout = TF_DerivedResultTypeListAttr<0>; +} + def TF_TPUCompilationResultOp : TF_Op<"TPUCompilationResult", [NoSideEffect]> { let summary = "Returns the result of a TPU compilation."; @@ -10912,43 +11630,6 @@ array([[1, 2, 3, 1, 2, 3], // input.rank() } -def TF_ToBoolOp : TF_Op<"ToBool", [NoSideEffect]> { - let summary = "Converts a tensor to a scalar predicate."; - - let description = [{ -Converts a tensor to a scalar predicate with the following rules: - -- For 0D tensors, truthiness is determined by comparing against a "zero" - value. For numerical types it is the obvious zero. For strings it is the - empty string. - -- For >0D tensors, truthiness is determined by looking at the number of - elements. If has zero elements, then the result is false. Otherwise the - result is true. - -This matches the behavior of If and While for determining if a tensor counts -as true/false for a branch condition. 
- }]; - - let arguments = (ins - TF_Tensor:$input - ); - - let results = (outs - I1Tensor:$output - ); - - TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; - - let builders = [OpBuilder< - "OpBuilder &builder, OperationState &result, Value value", [{ - build(builder, result, RankedTensorType::get({}, builder.getI1Type()), - value); - }]>]; - - let hasCanonicalizer = 1; -} - def TF_TopKV2Op : TF_Op<"TopKV2", [NoSideEffect]> { let summary = [{ Finds values and indices of the `k` largest elements for the last dimension. @@ -11370,6 +12051,44 @@ tf.unsorted_segment_sum(c, tf.constant([0, 1, 0]), num_segments=2) let verifier = [{ return VerifyUnsortedSegmentReduction(*this); }]; } +def TF_UpperBoundOp : TF_Op<"UpperBound", [NoSideEffect]> { + let summary = [{ +Applies upper_bound(sorted_search_values, values) along each row. + }]; + + let description = [{ +Each set of rows with the same index in (sorted_inputs, values) is treated +independently. The resulting row is the equivalent of calling +`np.searchsorted(sorted_inputs, values, side='right')`. + +The result is not a global index to the entire +`Tensor`, but rather just the index in the last dimension. + +A 2-D example: + sorted_sequence = [[0, 3, 9, 9, 10], + [1, 2, 3, 4, 5]] + values = [[2, 4, 9], + [0, 2, 6]] + + result = UpperBound(sorted_sequence, values) + + result == [[1, 2, 4], + [0, 2, 5]] + }]; + + let arguments = (ins + TF_Tensor:$sorted_inputs, + TF_Tensor:$values + ); + + let results = (outs + TF_I32OrI64Tensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedResultTypeAttr out_type = TF_DerivedResultTypeAttr<0>; +} + def TF_VarIsInitializedOp : TF_Op<"VarIsInitializedOp", []> { let summary = [{ Checks whether a resource handle-based variable has been initialized. @@ -11901,6 +12620,13 @@ https://www.tensorflow.org/performance/xla/operation_semantics#pad def TF_XlaRecvFromHostOp : TF_Op<"XlaRecvFromHost", []> { let summary = "An op to receive a tensor from the host."; + let description = [{ +output: the tensor that will be received from the host. +Toutput: element type for output. +shape: shape for output. +key: A unique identifier for this region used to match up host transfers. + }]; + let arguments = (ins TF_ShapeAttr:$shape, StrAttr:$key @@ -11945,6 +12671,31 @@ def TF_XlaReplicaIdOp : TF_Op<"XlaReplicaId", [NoSideEffect]> { ); } +def TF_XlaScatterOp : TF_Op<"XlaScatter", [NoSideEffect]> { + let summary = "Wraps the XLA Scatter operator documented at"; + + let description = [{ +https://www.tensorflow.org/xla/operation_semantics#scatter. 
+ }]; + + let arguments = (ins + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$operand, + TF_I32OrI64Tensor:$scatter_indices, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$updates, + + SymbolRefAttr:$update_computation, + StrAttr:$dimension_numbers, + BoolAttr:$indices_are_sorted + ); + + let results = (outs + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + ); + + TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_XlaSelfAdjointEigOp : TF_Op<"XlaSelfAdjointEig", [NoSideEffect]> { let summary = [{ Computes the eigen decomposition of a batch of self-adjoint matrices @@ -11977,6 +12728,12 @@ i=0...N-1. def TF_XlaSendToHostOp : TF_Op<"XlaSendToHost", []> { let summary = "An op to send a tensor to the host."; + let description = [{ +input: the tensor that will be sent to the host. +Tinput: element type for input. +key: A unique identifier for this region used to match up host transfers. + }]; + let arguments = (ins TF_Tensor:$input, @@ -12062,6 +12819,43 @@ def TF_ZerosLikeOp : TF_Op<"ZerosLike", [NoSideEffect, SameOperandsAndResultType TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF__FusedBatchNormExOp : TF_Op<"_FusedBatchNormEx", [NoSideEffect]> { + let summary = "Internal FusedBatchNorm operation: reserved for internal use."; + + let description = [{ +Do not invoke this operator directly in Python. A fusion optimization is +expected to create these operators. + }]; + + let arguments = (ins + TensorOf<[F16, F32]>:$x, + F32Tensor:$scale, + F32Tensor:$offset, + F32Tensor:$mean, + F32Tensor:$variance, + Variadic>:$side_input, + + DefaultValuedAttr:$epsilon, + DefaultValuedAttr:$exponential_avg_factor, + DefaultValuedAttr:$activation_mode, + DefaultValuedAttr:$data_format, + DefaultValuedAttr:$is_training + ); + + let results = (outs + TensorOf<[F16, F32]>:$y, + F32Tensor:$batch_mean, + F32Tensor:$batch_variance, + F32Tensor:$reserve_space_1, + F32Tensor:$reserve_space_2, + F32Tensor:$reserve_space_3 + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr U = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandSizeAttr num_side_inputs = TF_DerivedOperandSizeAttr<5>; +} + def TF__FusedConv2DOp : TF_Op<"_FusedConv2D", [NoSideEffect]> { let summary = [{ Performs a convolution followed by a specified series of operations. @@ -12183,18 +12977,17 @@ Compiles a computations for execution on one or more TPU devices. }]; let description = [{ -For the internal use of the distributed TPU compiler. Note that currently only -single TPU device is supported. +For the internal use of the distributed TPU compiler. 'mlir_module' is a serialized MLIR module with a `main` function that contains target computation. 'dynamic_shapes' contains dynamic shapes of arguments whose shapes were not known statically at TPUReplication rewrite time. -'metadata' is a serialized TPUCompileMetadataProto describing -the shapes and types of the inputs to the computation, as well as a mapping onto -the TPU pod topology. -'program' output is a string key that is passed to the _TPUExecute op and -used to look up the program in the compilation cache. 
+'metadata' is a serialized TPUCompileMetadataProto describing the shapes and +types of the inputs to the computation, as well as a mapping onto the TPU pod +topology. +'program' output is a string key that is passed to the TPUExecute op and used to +look up the program in the compilation cache. }]; let arguments = (ins @@ -12231,6 +13024,28 @@ rewrite passes must replace this op with a _TPUCompileMlir op `program` output. ); } +def TF__UnaryOpsCompositionOp : TF_Op<"_UnaryOpsComposition", [NoSideEffect, SameOperandsAndResultType]> { + let summary = [{ +*NOTE*: Do not invoke this operator directly in Python. Graph rewrite pass is + }]; + + let description = [{ +expected to create these operators. + }]; + + let arguments = (ins + TensorOf<[F16, F32, F64]>:$x, + + StrArrayAttr:$op_names + ); + + let results = (outs + TensorOf<[F16, F32, F64]>:$y + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF__XlaHostComputeMlirOp : TF_Op<"_XlaHostComputeMlir", []> { let summary = [{ A pseudo-op to represent host-side computation in an XLA program. diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index e35e5dc40a8..737442d5f8c 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -55,6 +55,8 @@ limitations under the License. #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Interfaces/DecodeAttributesInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/FoldInterfaces.h" // from @llvm-project #include "mlir/Parser.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project @@ -112,6 +114,22 @@ bool HasSingleUse(FuncOp func) { return true; } +struct TFConstantFoldInterface : public DialectFoldInterface { + TFConstantFoldInterface(Dialect *dialect) : DialectFoldInterface(dialect) {} + LogicalResult fold(Operation *op, ArrayRef operands, + SmallVectorImpl &results) const final { + return TensorFlowDialect::constantFold(op, operands, results); + } +}; + +struct TFDecodeAttributesInterface : public DialectDecodeAttributesInterface { + TFDecodeAttributesInterface(Dialect *dialect) + : DialectDecodeAttributesInterface(dialect) {} + LogicalResult decode(OpaqueElementsAttr input, ElementsAttr &output) const { + return TensorFlowDialect::decode(input, output); + } +}; + struct TFInlinerInterface : public DialectInlinerInterface { using DialectInlinerInterface::DialectInlinerInterface; @@ -206,6 +224,9 @@ std::vector *TensorFlowDialect::additional_operation_hooks_ = new std::vector(); +TensorFlowDialect::ConstantFoldHook TensorFlowDialect::constant_fold_hook_; +TensorFlowDialect::DecodeConstantHook TensorFlowDialect::decode_constant_hook_; + TensorFlowDialect::TensorFlowDialect(MLIRContext *context) : Dialect(/*name=*/"tf", context, TypeID::get()) { addOperations< @@ -217,7 +238,8 @@ TensorFlowDialect::TensorFlowDialect(MLIRContext *context) #define HANDLE_LAST_TF_TYPE(tftype, enumerant, name) tftype##Type #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.def" >(); - addInterfaces(); + addInterfaces(); addAttributes(); // Support unknown operations because not all TensorFlow operations are @@ -336,16 +358,12 @@ Attribute TensorFlowDialect::parseAttribute(DialectAsmParser &parser, void TensorFlowDialect::printAttribute(Attribute attr, 
DialectAsmPrinter &os) const { - switch (attr.getKind()) { - case AttrKind::SHAPE: - PrintShapeAttr(attr.cast(), os); - break; - case AttrKind::FUNC: - PrintFuncAttr(attr.cast(), os); - break; - default: - llvm_unreachable("unexpected tensorflow attribute kind"); - } + if (auto shape_attr = attr.dyn_cast()) + PrintShapeAttr(shape_attr, os); + else if (auto func_attr = attr.dyn_cast()) + PrintFuncAttr(func_attr, os); + else + llvm_unreachable("unexpected tensorflow attribute type"); } // Parses a type registered to this dialect. @@ -354,51 +372,37 @@ Type TensorFlowDialect::parseType(DialectAsmParser &parser) const { if (parser.parseKeyword(&data)) return Type(); Location loc = parser.getEncodedSourceLoc(parser.getNameLoc()); - auto typeKind = llvm::StringSwitch(data) + #define HANDLE_TF_TYPE(tftype, enumerant, name) \ - .Case(name, TensorFlowTypes::enumerant) + if (data == name) return tftype##Type::get(getContext()); // Custom TensorFlow types are handled separately at the end as they do partial // match. #define HANDLE_CUSTOM_TF_TYPE(tftype, enumerant, name) // NOLINTNEXTLINE #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.def" - .StartsWith("resource", TensorFlowTypes::RESOURCE) - .StartsWith("variant", TensorFlowTypes::VARIANT) - .Default(0); - switch (typeKind) { - default: - return (emitError(loc, "unknown TensorFlow type: " + data), nullptr); -#define HANDLE_TF_TYPE(tftype, enumerant, name) \ - case TensorFlowTypes::enumerant: \ - return tftype##Type::get(getContext()); -#define HANDLE_CUSTOM_TF_TYPE(tftype, enumerant, name) -// NOLINTNEXTLINE -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.def" - case TensorFlowTypes::RESOURCE: - return ParseResourceType(parser, loc); - case TensorFlowTypes::VARIANT: - return ParseVariantType(parser, loc); - } + if (data.startswith("resource")) return ParseResourceType(parser, loc); + if (data.startswith("variant")) return ParseVariantType(parser, loc); + return (emitError(loc, "unknown TensorFlow type: " + data), nullptr); } // Prints a type registered to this dialect. 
void TensorFlowDialect::printType(Type ty, DialectAsmPrinter &os) const { assert(ty.isa()); - switch (ty.getKind()) { - default: - llvm_unreachable("unexpected tensorflow type kind"); -#define HANDLE_TF_TYPE(tftype, enumerant, name) \ - case TensorFlowTypes::enumerant: \ - os << name; \ - break; +#define HANDLE_TF_TYPE(tftype, enumerant, name) \ + if (auto derived_ty = ty.dyn_cast()) { \ + os << name; \ + return; \ + } #define HANDLE_CUSTOM_TF_TYPE(tftype, enumerant, name) \ - case TensorFlowTypes::enumerant: \ - Print##tftype##Type(ty.cast(), os); \ - break; + if (auto derived_ty = ty.dyn_cast()) { \ + Print##tftype##Type(derived_ty, os); \ + return; \ + } // NOLINTNEXTLINE #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.def" - } + + llvm_unreachable("unexpected tensorflow type kind"); } namespace { diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h index bbcce4ee177..3169f7fba8d 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h @@ -116,10 +116,35 @@ class TensorFlowDialect : public Dialect { 0, (addOperation(AbstractOperation::get(*this)), 0)...}; } + using ConstantFoldHook = LogicalResult (*)(Operation *, ArrayRef, + SmallVectorImpl &); + static void RegisterConstantFoldHook(ConstantFoldHook fn) { + constant_fold_hook_ = std::move(fn); + } + + static LogicalResult constantFold(Operation *op, ArrayRef operands, + SmallVectorImpl &results) { + if (constant_fold_hook_) return constant_fold_hook_(op, operands, results); + return failure(); + } + + using DecodeConstantHook = LogicalResult (*)(OpaqueElementsAttr input, + ElementsAttr &output); + static void RegisterDecodeConstantHook(DecodeConstantHook fn) { + decode_constant_hook_ = std::move(fn); + } + static LogicalResult decode(OpaqueElementsAttr input, ElementsAttr &output) { + if (decode_constant_hook_) return decode_constant_hook_(input, output); + return failure(); + } + private: // Hook functions which may add additional operations to the dialect. // These are invoked at construction time. static std::vector *additional_operation_hooks_; + + static ConstantFoldHook constant_fold_hook_; + static DecodeConstantHook decode_constant_hook_; }; } // namespace TF diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td index 5269bb82239..db0a97d4b96 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td @@ -97,10 +97,10 @@ An n-way switch statement, implementing the following: Variadic:$input, Confined]>:$branches, - DefaultValuedAttr:$output_shapes, - // Used to map StatelessCase and Case to a common op. - DefaultValuedAttr:$is_stateless + // Used to map StatelessCase and Case op defined in TensorFlow to a common + // op. + BoolAttr:$is_stateless ); let results = (outs @@ -109,8 +109,57 @@ An n-way switch statement, implementing the following: TF_DerivedOperandTypeListAttr Tin = TF_DerivedOperandTypeListAttr<1>; TF_DerivedResultTypeListAttr Tout = TF_DerivedResultTypeListAttr<0>; + TF_DerivedResultShapeListAttr output_shapes = TF_DerivedResultShapeListAttr<0>; let hasCanonicalizer = 1; + + let verifier = [{ + return Verify(*this); + }]; +} + +def TF_CaseRegionOp : TF_Op<"CaseRegion", + [SingleBlockImplicitTerminator<"YieldOp">, NoRegionArguments]> { + let summary = [{ +An n-way switch statement which calls a single branch function. 
+ }]; + + let description = [{ +An n-way switch statement, implementing the following: + ``` + switch (branch_index) { + case 0: + output = branches[0](input); + break; + case 1: + output = branches[1](input); + break; + ... + case [[nbranches-1]]: + default: + output = branches[nbranches-1](input); + break; + } + ``` + }]; + + let arguments = (ins + I32Tensor:$branch_index, + + // Used to map StatelessCase and Case op defined in TensorFlow to a common + // op. + BoolAttr:$is_stateless + ); + + let results = (outs + Variadic:$output + ); + + let regions = (region VariadicRegion>:$branches); + + let verifier = [{ + return Verify(*this); + }]; } // In MLIR, the TensorFlow tensor value is represented as an ElementsAttr, with @@ -168,30 +217,6 @@ source_target_pairs=`[[0,1],[1,2],[2,3],[3,0]]` gets the outputs: TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } - -def TF_DataFormatVecPermuteOp : TF_Op<"DataFormatVecPermute", [NoSideEffect, SameOperandsAndResultType]> { - let summary = "Permute input tensor from `src_format` to `dst_format`"; - - let description = [{ -Input tensor must be a vector of size 4, or a 4x2 tensor. - }]; - - let arguments = (ins - TF_I32OrI64Tensor:$x, - - DefaultValuedAttr:$src_format, - DefaultValuedAttr:$dst_format - ); - - let results = (outs - TF_I32OrI64Tensor:$y - ); - - TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; - - let verifier = [{ return Verify(*this); }]; -} - def TF_EmptyTensorListOp : TF_TensorListInitOp<"EmptyTensorList"> { let summary = "Creates and returns an empty tensor list."; @@ -292,7 +317,7 @@ else_branch: A function that takes 'inputs' and returns a list of } def TF_YieldOp : TF_Op<"Yield", - [Terminator, ParentOneOf<["IfRegionOp", "WhileRegionOp"]>]> { + [Terminator, ParentOneOf<["CaseRegionOp", "IfRegionOp", "WhileRegionOp"]>]> { let summary = "Yield operation"; let description = [{ @@ -328,7 +353,7 @@ else_branch: A region that computes the outputs of the op if cond = false. }]; let arguments = (ins - TF_Tensor:$cond, + 0DTensorOf<[I1]>:$cond, // Used to map StatelessIf and If op defined in TensorFlow to a common op. BoolAttr:$is_stateless @@ -338,47 +363,13 @@ else_branch: A region that computes the outputs of the op if cond = false. Variadic:$output ); - TF_DerivedOperandTypeAttr Tcond = TF_DerivedOperandTypeAttr<0>; - TF_DerivedOperandTypeListAttr Tin = TF_DerivedOperandTypeListAttr<1>; - TF_DerivedResultTypeListAttr Tout = TF_DerivedResultTypeListAttr<0>; - let regions = (region SizedRegion<1>:$then_branch, SizedRegion<1>:$else_branch); let verifier = [{ return Verify(*this); }]; -} -def TF_MeanOp : TF_Op<"Mean", [NoSideEffect, TF_FoldOperandsTransposeInterface]> { - let summary = "Computes the mean of elements across dimensions of a tensor."; - - let description = [{ -Reduces `input` along the dimensions given in `axis`. Unless -`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in -`axis`. If `keep_dims` is true, the reduced dimensions are -retained with length 1. 
- }]; - - let arguments = (ins - TF_NumberTensor:$input, - TF_I32OrI64Tensor:$reduction_indices, - - DefaultValuedAttr:$keep_dims - ); - - let results = (outs - TF_NumberTensor:$output - ); - - TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; - TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<1>; - - let extraClassDeclaration = [{ - // TF_FoldOperandsTransposeInterface: - SmallVector GetLayoutDependentArgs() { return {0}; } - SmallVector GetLayoutDependentResults() { return {}; } - LogicalResult FoldOperandsPermutation(ArrayRef permutation); - }]; + let hasCanonicalizer = 1; } def TF_LegacyCallOp : TF_Op<"LegacyCall", @@ -755,8 +746,6 @@ def TL_WhileRegionOp : TF_Op<"WhileRegion", ); let results = (outs Variadic:$output); - TF_DerivedOperandTypeListAttr T = TF_DerivedOperandTypeListAttr<0>; - let regions = (region SizedRegion<1>:$cond, SizedRegion<1>:$body); let verifier = [{ return Verify(*this); }]; @@ -841,45 +830,6 @@ Example: TF_DerivedOperandOrResultHandleShapeAttr<"resource">; } -// Not generated because it begins with an underscore, which isn't allowed by -// the C++ standard. -def TF_FusedBatchNormExOp : TF_Op<"_FusedBatchNormEx", [NoSideEffect]> { - let summary = "Internal FusedBatchNorm operation: reserved for internal use"; - - let description = [{ - Do not invoke this operator directly in Python. A fusion optimization is - expected to create these operators. - }]; - - let arguments = (ins - TensorOf<[F16, F32]>:$x, - F32Tensor:$scale, - F32Tensor:$offset, - F32Tensor:$mean, - F32Tensor:$variance, - Variadic>:$side_input, - - DefaultValuedAttr:$epsilon, - DefaultValuedAttr:$exponential_avg_factor, - DefaultValuedAttr:$activation_mode, - DefaultValuedAttr:$data_format, - DefaultValuedAttr:$is_training - ); - - let results = (outs - TensorOf<[F16, F32]>:$y, - F32Tensor:$batch_mean, - F32Tensor:$batch_variance, - F32Tensor:$reserve_space_1, - F32Tensor:$reserve_space_2, - F32Tensor:$reserve_space_3 - ); - - TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; - TF_DerivedOperandTypeAttr U = TF_DerivedOperandTypeAttr<1>; - TF_DerivedOperandSizeAttr num_side_inputs = TF_DerivedOperandSizeAttr<5>; -} - // Multiple variadic operands with different sizes are not supported by the // dialect generator, so we manually added the op. def TF_SendTPUEmbeddingGradientsOp : TF_Op<"SendTPUEmbeddingGradients", [AttrSizedOperandSegments]> { @@ -1150,6 +1100,43 @@ def TF_TensorSliceDatasetOp : TF_Op<"TensorSliceDataset", []> { TF_DerivedOperandTypeListAttr Toutput_types = TF_DerivedOperandTypeListAttr<0>; } +def TF_ToBoolOp : TF_Op<"ToBool", [NoSideEffect]> { + let summary = "Converts a tensor to a scalar predicate."; + + let description = [{ +Converts a tensor to a scalar predicate with the following rules: + +- For 0D tensors, truthiness is determined by comparing against a "zero" + value. For numerical types it is the obvious zero. For strings it is the + empty string. + +- For >0D tensors, truthiness is determined by looking at the number of + elements. If has zero elements, then the result is false. Otherwise the + result is true. + +This matches the behavior of If and While for determining if a tensor counts +as true/false for a branch condition. 
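+
+For illustration only (the operand types below are arbitrary examples and not
+part of the op definition), the op is expected to appear in the IR as:
+
+```
+// 0D input: the predicate is "value != 0" (or != "" for strings).
+%p = "tf.ToBool"(%scalar) : (tensor<f32>) -> tensor<i1>
+// >0D input: the predicate is "the tensor has at least one element".
+%q = "tf.ToBool"(%matrix) : (tensor<2x0xf32>) -> tensor<i1>
+```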
+ }]; + + let arguments = (ins + TF_Tensor:$input + ); + + let results = (outs + 0DTensorOf<[I1]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + + let builders = [OpBuilder< + "OpBuilder &builder, OperationState &result, Value value", [{ + build(builder, result, RankedTensorType::get({}, builder.getI1Type()), + value); + }]>]; + + let hasCanonicalizer = 1; +} + def TF_BesselI0eOp : TF_Op<"BesselI0e", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes the Bessel i0e function of `x` element-wise."; @@ -1192,36 +1179,6 @@ This function is faster and numerically stabler than `bessel_i1(x)`. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_StringToHashBucketFastOp : TF_Op<"StringToHashBucketFast", [NoSideEffect]> { - let summary = [{ -Converts each string in the input Tensor to its hash mod by a number of buckets. - }]; - - let description = [{ -The hash function is deterministic on the content of the string within the -process and will never change. However, it is not suitable for cryptography. -This function may be used when CPU time is scarce and inputs are trusted or -unimportant. There is a risk of adversaries constructing inputs that all hash -to the same bucket. To prevent this problem, use a strong hash function with -`tf.string_to_hash_bucket_strong`. - -Examples: - ->>> tf.strings.to_hash_bucket_fast(["Hello", "TensorFlow", "2.x"], 3).numpy() -array([0, 2, 2]) - }]; - - let arguments = (ins - TF_StrTensor:$input, - - Confined]>:$num_buckets - ); - - let results = (outs - I64Tensor:$output - ); -} - def TF_TPUPartitionedCallOp : TF_Op<"TPUPartitionedCall", [CallOpInterface]> { let summary = "Calls a function placed on a specified TPU device."; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc index 1a730a38618..b465c1da68c 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include #include +#include #include #include #include @@ -37,6 +38,7 @@ limitations under the License. #include "llvm/ADT/iterator_range.h" #include "llvm/Support/Casting.h" #include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/Dialect/Traits.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project @@ -64,6 +66,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/util/tensor_format.h" @@ -438,6 +441,19 @@ static LogicalResult Verify(BroadcastToOp op) { return success(); } +OpFoldResult BroadcastToOp::fold(ArrayRef operands) { + Value input = this->input(); + + // Fold broadcast if operand and result types are the same and all dimensions + // are statically known (no-op broadcast). 
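+  // Illustrative sketch (mirrors the @testBroadcastToNoOp case added to
+  // canonicalize.mlir in this change; shapes are examples only):
+  //   %0 = "tf.BroadcastTo"(%arg0, %shape)
+  //          : (tensor<2x4xf32>, tensor<2xi32>) -> tensor<2x4xf32>
+  // folds to %arg0, since the result type already equals the operand type.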
+ auto result_ty = getType().dyn_cast(); + if (result_ty && result_ty.hasStaticShape() && result_ty == input.getType()) { + return input; + } + + return {}; +} + //===----------------------------------------------------------------------===// // CaseOp //===----------------------------------------------------------------------===// @@ -456,28 +472,139 @@ LogicalResult FoldConstantCaseOp::matchAndRewrite( DenseIntElementsAttr branch; if (!matchPattern(op.branch_index(), m_Constant(&branch))) return failure(); - // Only attempt to fold scalar valued case statements. - // TODO(jpienaar): This can be removed if CaseOp's verifier covers it. - if (!branch.getType().cast().getShape().empty()) - return failure(); - int index = *branch.getValues().begin(); - // TODO(jpienaar): This can be removed if CaseOp's verifier covers it. - if (index >= op.branches().size()) return failure(); + if (index < 0 || index >= op.branches().size()) + index = op.branches().size() - 1; auto func = op.branches()[index].cast(); auto empty = rewriter.getStringAttr(""); auto call_op = rewriter.create( op.getLoc(), op.getResultTypes(), op.getOperands().drop_front(), func, /*config=*/empty, /*config_proto=*/empty, /*executor_type=*/empty); - PropagateDeviceAndInternalAttrs(op.getOperation(), call_op); + CopyDeviceAndUnderscoredAttributes(op.getOperation(), call_op); rewriter.replaceOp(op, call_op.getResults()); return success(); } void CaseOp::getCanonicalizationPatterns(OwningRewritePatternList &results, MLIRContext *context) { - results.insert(context); + results.insert>(context); +} + +static LogicalResult VerifyCaseOpBase(Operation *op, Value branch_index) { + if (!IsOfRankOrUnranked(branch_index, 0)) + return op->emitOpError() + << "expects 'branch_index' to be a scalar, but got " + << branch_index.getType(); + return success(); +} + +static LogicalResult VerifyCaseOrIfOpBranchFunctions( + Operation *op, ArrayRef branches, + llvm::function_ref branch_name) { + SmallVector branch_types; + branch_types.reserve(branches.size()); + + // Functions have one less operand compared to op as first operand is elided + // (`cond` of `tf.If` and `branch_index` of `tf.Case`). + int expected_num_inputs = op->getNumOperands() - 1; + int expected_num_results = op->getNumResults(); + for (auto branch : llvm::enumerate(branches)) { + auto branch_func = SymbolTable::lookupNearestSymbolFrom( + op, branch.value().cast()); + if (!branch_func) + return op->emitOpError() + << "expects " << branch_name(branch.index()) << " (" + << branch.value() << ") to point to a defined function"; + + FunctionType branch_type = branch_func.getType(); + if (branch_type.getNumInputs() != expected_num_inputs) + return op->emitOpError() + << "expects all branches to have " << expected_num_inputs + << " input(s), but " << branch_name(branch.index()) << " has " + << branch_type.getNumInputs() << " input(s)"; + + if (branch_type.getNumResults() != expected_num_results) + return op->emitOpError() + << "expects all branches to have " << expected_num_results + << " result(s), but " << branch_name(branch.index()) << " has " + << branch_type.getNumResults() << " result(s)"; + + // Non-conditional operands starting with the second operand are passed to + // branches and should be compatible across all branches' inputs. 
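+    // For example (branch names and types here are illustrative only), given
+    //   %r = "tf.Case"(%idx, %x) {branches = [@b0, @b1], is_stateless = false}
+    //          : (tensor<i32>, tensor<2xf32>) -> tensor<2xf32>
+    // every branch function must take one argument cast compatible with
+    // tensor<2xf32> and return one result cast compatible with tensor<2xf32>.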
+ for (auto operand_type : + llvm::enumerate(llvm::drop_begin(op->getOperandTypes(), 1))) { + Type branch_input_i_type = branch_type.getInput(operand_type.index()); + if (!AreCastCompatible({operand_type.value(), branch_input_i_type})) + return op->emitOpError() + << "expects operand type " << operand_type.value() + << " to be cast compatible with " << branch_name(branch.index()) + << " input type " << branch_input_i_type << " at index " + << operand_type.index(); + } + + // Branches' results should be pair-wise compatible with the op results. + for (auto result_type : llvm::enumerate(op->getResultTypes())) { + Type branch_result_i_type = branch_type.getResult(result_type.index()); + if (!AreCastCompatible({result_type.value(), branch_result_i_type})) + return op->emitOpError() + << "expects result type " << result_type.value() + << " to be cast compatible with " << branch_name(branch.index()) + << " result type " << branch_result_i_type << " at index " + << result_type.index(); + } + + branch_types.push_back(branch_type); + } + + // If branches have incompatible input types that means that no tensor can + // serve as input to all the functions. Hence, the op is invalid. + for (int i = 0; i < expected_num_inputs; ++i) { + SmallVector branch_input_i_types; + branch_input_i_types.reserve(branches.size()); + llvm::transform( + branch_types, std::back_inserter(branch_input_i_types), + [i](FunctionType &branch_type) { return branch_type.getInput(i); }); + if (!AreCastCompatible(branch_input_i_types)) { + std::string input_types_str; + llvm::raw_string_ostream os(input_types_str); + llvm::interleaveComma(branch_input_i_types, os); + return op->emitOpError() + << "expects all branch input type(s) (" << os.str() + << ") at index " << i << " to be cast compatible"; + } + } + + return success(); +} + +static LogicalResult Verify(CaseOp op) { + if (failed(VerifyCaseOpBase(op, op.branch_index()))) return failure(); + auto branch_name = [](unsigned index) { + return llvm::formatv("branch #{0}", index).str(); + }; + return VerifyCaseOrIfOpBranchFunctions(op, op.branches().getValue(), + branch_name); +} + +//===----------------------------------------------------------------------===// +// CaseRegionOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(CaseRegionOp op) { + if (op.branches().empty()) + return op.emitOpError() << "expects to have at least 1 region"; + + if (failed(VerifyCaseOpBase(op, op.branch_index()))) return failure(); + + for (auto region_and_idx : llvm::enumerate(op.branches())) { + std::string region_name = + llvm::formatv("region #{0}", region_and_idx.index()).str(); + if (failed(VerifyRegionResults(op, region_and_idx.value(), region_name))) + return failure(); + } + + return success(); } //===----------------------------------------------------------------------===// @@ -734,6 +861,35 @@ void ConcatV2Op::getCanonicalizationPatterns(OwningRewritePatternList &results, context); } +//===----------------------------------------------------------------------===// +// CumsumOp and CumprodOp +//===----------------------------------------------------------------------===// + +template ::value>::type * = nullptr> +static LogicalResult Verify(OpT op) { + if (!IsOfRankOrUnranked(op.axis(), 0)) + return op.emitOpError("requires scalar axis operand"); + + DenseIntElementsAttr axis_attr; + if (matchPattern(op.axis(), m_Constant(&axis_attr))) { + auto input_ty = op.x().getType().template dyn_cast(); + if (input_ty) { + int64_t rank = 
input_ty.getRank(); + assert(axis_attr.getNumElements() == 1 && + "scalar attribute should have exactly one element"); + int64_t axis = (*axis_attr.begin()).getSExtValue(); + if (axis < -rank || axis >= rank) { + return op.emitError() + << "axis operand should be within range [" << -rank << ", " + << rank << "); actual value: " << axis; + } + } + } + + return success(); +} + //===----------------------------------------------------------------------===// // ConcatOffsetOp //===----------------------------------------------------------------------===// @@ -1768,79 +1924,18 @@ static LogicalResult Verify(GatherV2Op op) { //===----------------------------------------------------------------------===// static LogicalResult Verify(IfOp op) { - auto then_fn = op.then_func(); - if (!then_fn) - return op.emitOpError("then_branch refers to an undefined function : ") - << op.then_branch(); - auto else_fn = op.else_func(); - if (!else_fn) - return op.emitOpError("else_branch refers to an undefined function : ") - << op.else_branch(); - auto then_fn_type = then_fn.getType(); - auto else_fn_type = else_fn.getType(); - - // Non-conditional operands starting with the second operand are passed to - // branches and should be pair-wise compatible with branches' inputs. - unsigned expected_num_inputs = op.getNumOperands() - 1; - if (then_fn_type.getNumInputs() != expected_num_inputs || - else_fn_type.getNumInputs() != expected_num_inputs) - return op.emitError("branches should have " + Twine(expected_num_inputs) + - " inputs"); - - for (unsigned i = 0; i < expected_num_inputs; ++i) { - auto operand_type = op.getOperand(i + 1).getType().cast(); - auto then_input_type = then_fn_type.getInput(i).cast(); - if (!AreCastCompatible({operand_type, then_input_type})) - return op.emitError( - llvm::formatv("then branch input type {0} is incompatible with " - "operand type {1} at index {2}", - then_input_type, operand_type, i)); - - auto else_input_type = else_fn_type.getInput(i).cast(); - if (!AreCastCompatible({operand_type, else_input_type})) - return op.emitError( - llvm::formatv("else branch input type {0} is incompatible with " - "operand type {1} at index {2}", - else_input_type, operand_type, i)); - - // If branches have incompatible input types that means that no tensor can - // serve as input to both the functions. Hence, the op is invalid. - if (!AreCastCompatible({then_input_type, else_input_type})) - return op.emitError(llvm::formatv( - "branches inputs have incompatible types {0} and {1} at index {2}", - then_input_type, else_input_type, i)); - } - - // Branches' results should be pair-wise compatible with the op results. 
- unsigned expected_num_results = op.getNumResults(); - if (then_fn_type.getNumResults() != expected_num_results || - else_fn_type.getNumResults() != expected_num_results) - return op.emitError("branches should have " + Twine(expected_num_results) + - " results"); - - for (unsigned i = 0; i < expected_num_results; ++i) { - auto result_type = op.getResult(i).getType().cast(); - auto then_result_type = then_fn_type.getResult(i).cast(); - if (!AreCastCompatible({then_result_type, result_type})) - return op.emitError( - llvm::formatv("then branch result type {0} is incompatible with op " - "result type {1} at index {2}", - then_result_type, result_type, i)); - - auto else_result_type = else_fn_type.getResult(i).cast(); - if (!AreCastCompatible({else_result_type, result_type})) - return op.emitError( - llvm::formatv("else branch result type {0} is incompatible with op " - "result type {1} at index {2}", - else_result_type, result_type, i)); - } - return success(); + auto branch_name = [](unsigned index) -> std::string { + return index == 0 ? "'then_branch'" : "'else_branch'"; + }; + return VerifyCaseOrIfOpBranchFunctions( + op, {op.then_branchAttr(), op.else_branchAttr()}, branch_name); } //===----------------------------------------------------------------------===// // IfOp canonicalization. //===----------------------------------------------------------------------===// +namespace { class FoldConstantIfOp : public OpRewritePattern { public: explicit FoldConstantIfOp(MLIRContext *context) @@ -1872,9 +1967,9 @@ LogicalResult FoldConstantIfOp::matchAndRewrite( auto rewrite = [&](auto op_type) { auto empty = rewriter.getStringAttr(""); auto call_op = rewriter.create( - op.getLoc(), op.getResultTypes(), op.getOperands().drop_front(), func, + op.getLoc(), op.getResultTypes(), op.input(), func, /*config=*/empty, /*config_proto=*/empty, /*executor_type=*/empty); - PropagateDeviceAndInternalAttrs(op.getOperation(), call_op); + CopyDeviceAndUnderscoredAttributes(op.getOperation(), call_op); rewriter.replaceOp(op, call_op.getResults()); }; @@ -1885,6 +1980,7 @@ LogicalResult FoldConstantIfOp::matchAndRewrite( return success(); } +} // anonymous namespace void IfOp::getCanonicalizationPatterns(OwningRewritePatternList &results, MLIRContext *context) { @@ -1903,6 +1999,61 @@ static LogicalResult Verify(IfRegionOp op) { return success(); } +namespace { +class FoldConstantIfRegionOp : public OpRewritePattern { + public: + explicit FoldConstantIfRegionOp(MLIRContext *context) + : OpRewritePattern(context) {} + LogicalResult matchAndRewrite(TF::IfRegionOp op, + PatternRewriter &rewriter) const override; +}; + +LogicalResult FoldConstantIfRegionOp::matchAndRewrite( + TF::IfRegionOp op, PatternRewriter &rewriter) const { + // Extract the constant cond value. + DenseIntElementsAttr cond_attr; + if (!matchPattern(op.cond(), m_Constant(&cond_attr))) return failure(); + + // IfRegion condition should always be a scalar. Select the region to fold to. + bool cond = cond_attr.getSplatValue().getValue(); + Region ®ion = cond ? op.then_branch() : op.else_branch(); + + // If the IfRegion is stateless but the region being inlined itself is not + // stateless, then inlining the region could cause a loss of information. + // However, its probably better to fold the IfRegion instead of having the + // dead branch stay. + + // Inline the region in place of the IfRegion op, and forward the yield + // inputs to the IfRegion op results. This is possible only if the yield + // types match the result types. 
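+  // Illustrative sketch (mirrors the @foldIfRegion tests added to
+  // canonicalize.mlir in this change): with a constant true condition,
+  //   %0 = "tf.IfRegion"(%true) ({ /* then */ }, { /* else */ })
+  //          {is_stateless = true} : (tensor<i1>) -> tensor<f32>
+  // is replaced by the body of the then region, with the operand of its
+  // tf.Yield forwarded (through a tf.Cast if the types differ) as %0.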
+ auto yield = cast(region.front().getTerminator()); + auto updated_results = llvm::to_vector<4>(yield.getOperands()); + + // If the yield types do not match the IfRegion result types, add appropriate + // casts. + rewriter.setInsertionPoint(yield); + for (auto it : llvm::zip(op.getResultTypes(), updated_results)) { + auto &updated_result = std::get<1>(it); + Type result_type = std::get<0>(it); + if (result_type != updated_result.getType()) { + updated_result = + rewriter.create(op.getLoc(), result_type, updated_result, + /*Truncate=*/rewriter.getBoolAttr(false)); + } + } + // Inline the region into the block containing the IfRegion. + rewriter.mergeBlockBefore(®ion.front(), op); + rewriter.eraseOp(yield); + rewriter.replaceOp(op, updated_results); + return success(); +} +} // anonymous namespace + +void IfRegionOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + //===----------------------------------------------------------------------===// // InvertOp //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_helpers.inc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_helpers.inc index 71f1560aa6c..bb7d9a50521 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_helpers.inc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_helpers.inc @@ -18,17 +18,6 @@ limitations under the License. // tf_verifiers or tf_ops. // TODO(jpienaar): Remove this file post refactoring. -// Propagates underscore and device attributes from src to dst. -// TODO(b/158769932): This should be a general feature instead post some policy -// discussion. -static void PropagateDeviceAndInternalAttrs(Operation *src, Operation *dst) { - auto device = mlir::Identifier::get("device", src->getContext()); - for (auto named_attr : src->getAttrs()) { - if (*named_attr.first.begin() == '_' || named_attr.first == device) - dst->setAttr(named_attr.first, named_attr.second); - } -} - //===----------------------------------------------------------------------===// // TF op helper functions //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc index 887473efbea..cbac03f80f8 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc @@ -707,7 +707,6 @@ OpFoldResult ReshapeOp::fold(ArrayRef operands) { // Fold reshape if operand and result types are the same and all dimensions // are statically known (no-op reshape). - // TODO(ezhulenev): Add the same folding for BroadcastToOp. auto result_ty = getType().dyn_cast(); if (result_ty && result_ty.hasStaticShape() && result_ty == tensor.getType()) { @@ -1015,9 +1014,23 @@ static LogicalResult Verify(SizeOp op) { return op.emitOpError( "requires ranked input tensor to be of rank INT32_MAX or less"); + // Output type needs to be scalar. 
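+  // For example, "tf.Size"(%t) : (tensor<3x5x7xf32>) -> tensor<i32> is valid,
+  // whereas a non-scalar result such as tensor<2xi32> is rejected (shapes are
+  // illustrative only).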
+ if (!IsOfRankOrUnranked(op.output(), /*rank=*/0)) + return op.emitOpError("requires scalar output"); + return success(); } +OpFoldResult SizeOp::fold(ArrayRef operands) { + ShapedType output_type = getType().cast(); + ShapedType input_type = getOperand().getType().cast(); + if (!input_type.hasStaticShape()) return {}; + int size = input_type.getNumElements(); + return DenseElementsAttr::get( + output_type, + IntegerAttr::get(output_type.getElementType(), /*value=*/size)); +} + //===----------------------------------------------------------------------===// // SliceOp //===----------------------------------------------------------------------===// @@ -1783,26 +1796,57 @@ static LogicalResult Verify(TopKV2Op op) { //===----------------------------------------------------------------------===// namespace { -// If the input to ToBoolOp is a `tensor`, then the ToBoolOp is an identity -// function and can be removed. -class ToBoolOfZeroDBoolTensor : public OpRewritePattern { +// If the input to ToBoolOp is a ranked tensor, then the ToBoolOp can be folded +// into an identity or an equality comparison. +class ToBoolOfRankedTensor : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(ToBoolOp op, PatternRewriter &rewriter) const override { - if (auto type = op.getOperand().getType().dyn_cast()) { - if (type.getRank() == 0 && type.getElementType().isInteger(1)) { - rewriter.replaceOp(op, op.getOperand()); - return success(); - } + auto type = op.getOperand().getType().dyn_cast(); + // If the input is an unranked tensor, cannpt rewrite. + if (!type) return failure(); + + // Expected return type of the ToBool operation. + auto result_type = op.getResult().getType().cast(); + + // If input is already a tensor, it can be folded into an identity. + if (type == result_type) { + rewriter.replaceOp(op, op.getOperand()); + return success(); } - return failure(); + + if (type.getRank() == 0) { + // If the input is a scalar tensor, the ToBool can be expanded to + // element != 0 (for numerical values) or element == empty (for string). + Type element_type = type.getElementType(); + Attribute zero_attr; + if (element_type.isIntOrFloat()) + zero_attr = rewriter.getZeroAttr(type); + else if (element_type.isa()) + zero_attr = DenseStringElementsAttr::get(type, {""}); + + if (!zero_attr) return failure(); + + auto zero_const = rewriter.create(op.getLoc(), zero_attr); + rewriter.replaceOpWithNewOp( + op, result_type, op.getOperand(), zero_const, false); + } else { + // If the input is a non-scalar ranked tensor, ToBool can be expanded + // to numElements != 0. numElements will be 0 iff one of the dimensions is + // zero. 
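+      // Illustrative sketch (mirrors the @ToBool_2DTensor* tests added to
+      // canonicalize.mlir in this change):
+      //   "tf.ToBool"(%t) : (tensor<1x5xf32>) -> tensor<i1>  -> constant true
+      //   "tf.ToBool"(%t) : (tensor<1x0xf32>) -> tensor<i1>  -> constant false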
+ bool any_zero = + llvm::any_of(type.getShape(), [](int64_t dim) { return dim == 0; }); + rewriter.replaceOpWithNewOp( + op, result_type, DenseElementsAttr::get(result_type, {!any_zero})); + } + return success(); } }; } // namespace void ToBoolOp::getCanonicalizationPatterns(OwningRewritePatternList &results, MLIRContext *context) { - results.insert(context); + results.insert(context); } //===----------------------------------------------------------------------===// @@ -1895,11 +1939,9 @@ void TransposeOp::build(OpBuilder &builder, OperationState &result, Value x, namespace { OpFoldResult FoldIdentityTranspose(TransposeOp op) { - auto const_perm = dyn_cast_or_null(op.perm().getDefiningOp()); - if (!const_perm) return {}; - - auto const_value = const_perm.value(); - const auto elements = const_value.getValues(); + DenseIntElementsAttr perm; + if (!matchPattern(op.perm(), m_Constant(&perm))) return {}; + const auto elements = perm.getValues(); for (auto it : llvm::enumerate(elements)) { if (it.index() != it.value()) return {}; @@ -1922,14 +1964,14 @@ OpFoldResult FoldCancellableTranspose(TransposeOp op) { if (!transpose) return {}; // Permutations defined by constant operations. - auto perm0 = dyn_cast_or_null(op.perm().getDefiningOp()); - auto perm1 = dyn_cast_or_null(transpose.perm().getDefiningOp()); - if (!perm0 || !perm1) return {}; + DenseIntElementsAttr perm0; + DenseIntElementsAttr perm1; + if (!matchPattern(op.perm(), m_Constant(&perm0)) || + !matchPattern(transpose.perm(), m_Constant(&perm1))) + return {}; // With permutation indices that cancel each other - auto perm0_value = perm0.value().cast(); - auto perm1_value = perm1.value().cast(); - if (!AreCancellablePermutations(perm0_value, perm1_value)) return {}; + if (!AreCancellablePermutations(perm0, perm1)) return {}; return transpose.x(); } diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h index fc8e6f40f65..412bf113a0f 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h @@ -33,7 +33,7 @@ namespace TF { static inline LogicalResult VerifyRefTypeMatch(mlir::Type type, mlir::Type maybe_ref_type) { if (auto ref_type = maybe_ref_type.dyn_cast()) - return success(ref_type.RemoveRef().getKind() == type.getKind()); + return success(ref_type.RemoveRef().getTypeID() == type.getTypeID()); return failure(); } diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc index 994378ea1cf..2ec73824f6c 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc @@ -17,6 +17,7 @@ limitations under the License. #include "llvm/Support/ErrorHandling.h" #include "mlir/Dialect/Traits.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project @@ -100,7 +101,7 @@ mlir::Type GetCastCompatibleType(mlir::Type a, mlir::Type b, if (a == b) return a; } } - if (a.getKind() != b.getKind()) return nullptr; + if (a.getTypeID() != b.getTypeID()) return nullptr; // If either is not a type that contain subtypes then the types are not cast // compatible. 
@@ -178,127 +179,116 @@ ResultShapeIterator::ResultShapeIterator(Operation::result_iterator it) // TF types helper functions //===----------------------------------------------------------------------===// +bool TensorFlowType::classof(Type type) { + return type.getDialect().getNamespace() == "tf"; +} +bool TensorFlowRefType::classof(Type type) { + return type.isa< +#define HANDLE_TF_TYPE(tftype, enumerant, name) +#define HANDLE_TF_REF_TYPE(tftype, enumerant, name) tftype##Type, +#define HANDLE_LAST_TF_TYPE(tftype, enumerant, name) tftype##Type +// NOLINTNEXTLINE +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.def" + >(); +} +bool TensorFlowTypeWithSubtype::classof(Type type) { + return type.isa(); +} + TensorFlowType TensorFlowRefType::get(Type type) { MLIRContext* ctx = type.getContext(); - switch (getElementTypeOrSelf(type).getKind()) { - case StandardTypes::F16: - return HalfRefType::get(ctx); - case StandardTypes::F32: - return FloatRefType::get(ctx); - case StandardTypes::F64: - return DoubleRefType::get(ctx); - case StandardTypes::BF16: - return Bfloat16RefType::get(ctx); - case StandardTypes::Complex: { - const auto& etype = type.cast().getElementType(); - switch (getElementTypeOrSelf(etype).getKind()) { - case StandardTypes::F32: - return Complex64RefType::get(ctx); - case StandardTypes::F64: - return Complex128RefType::get(ctx); - default: - llvm_unreachable("unexpected complex type"); - } + type = getElementTypeOrSelf(type); + if (type.isF16()) { + return HalfRefType::get(ctx); + } else if (type.isF32()) { + return FloatRefType::get(ctx); + } else if (type.isF64()) { + return DoubleRefType::get(ctx); + } else if (type.isBF16()) { + return Bfloat16RefType::get(ctx); + } else if (auto complex_type = type.dyn_cast()) { + Type etype = complex_type.getElementType(); + if (etype.isF32()) { + return Complex64RefType::get(ctx); + } else if (etype.isF64()) { + return Complex128RefType::get(ctx); } - case StandardTypes::Integer: { - const auto& itype = type.cast(); - switch (itype.getWidth()) { - case 1: - return BoolRefType::get(ctx); - case 8: - return itype.isUnsigned() ? TensorFlowType(Uint8RefType::get(ctx)) - : Int8RefType::get(ctx); - case 16: - return itype.isUnsigned() ? TensorFlowType(Uint16RefType::get(ctx)) - : Int16RefType::get(ctx); - case 32: - return itype.isUnsigned() ? TensorFlowType(Uint32RefType::get(ctx)) - : Int32RefType::get(ctx); - case 64: - return itype.isUnsigned() ? TensorFlowType(Uint64RefType::get(ctx)) - : Int64RefType::get(ctx); - default: - llvm_unreachable("unexpected integer type"); - } + llvm_unreachable("unexpected complex type"); + } else if (auto itype = type.dyn_cast()) { + switch (itype.getWidth()) { + case 1: + return BoolRefType::get(ctx); + case 8: + return itype.isUnsigned() ? TensorFlowType(Uint8RefType::get(ctx)) + : Int8RefType::get(ctx); + case 16: + return itype.isUnsigned() ? TensorFlowType(Uint16RefType::get(ctx)) + : Int16RefType::get(ctx); + case 32: + return itype.isUnsigned() ? TensorFlowType(Uint32RefType::get(ctx)) + : Int32RefType::get(ctx); + case 64: + return itype.isUnsigned() ? 
TensorFlowType(Uint64RefType::get(ctx)) + : Int64RefType::get(ctx); + default: + llvm_unreachable("unexpected integer type"); } -#define HANDLE_TF_TYPE(tftype, enumerant, name) \ - case TensorFlowTypes::enumerant: \ + } +#define HANDLE_TF_TYPE(tftype, enumerant, name) \ + if (auto derived_ty = type.dyn_cast()) \ return tftype##RefType::get(ctx); #define HANDLE_TF_REF_TYPE(tftype, enumerant, name) // NOLINTNEXTLINE #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.def" - default: - llvm_unreachable("unexpected type kind"); - } + llvm_unreachable("unexpected type kind"); } Type TensorFlowRefType::RemoveRef() { MLIRContext* ctx = getContext(); - switch (getKind()) { - case TensorFlowTypes::HALF_REF: - return mlir::FloatType::getF16(ctx); - case TensorFlowTypes::FLOAT_REF: - return mlir::FloatType::getF32(ctx); - case TensorFlowTypes::DOUBLE_REF: - return mlir::FloatType::getF64(ctx); - case TensorFlowTypes::BFLOAT16_REF: - return mlir::FloatType::getBF16(ctx); - case TensorFlowTypes::BOOL_REF: - return mlir::IntegerType::get(1, ctx); - case TensorFlowTypes::INT8_REF: - return mlir::IntegerType::get(8, ctx); - case TensorFlowTypes::INT16_REF: - return mlir::IntegerType::get(16, ctx); - case TensorFlowTypes::INT32_REF: - return mlir::IntegerType::get(32, ctx); - case TensorFlowTypes::INT64_REF: - return mlir::IntegerType::get(64, ctx); - case TensorFlowTypes::UINT8_REF: - return mlir::IntegerType::get(8, IntegerType::Unsigned, ctx); - case TensorFlowTypes::UINT16_REF: - return mlir::IntegerType::get(16, IntegerType::Unsigned, ctx); - case TensorFlowTypes::UINT32_REF: - return mlir::IntegerType::get(32, IntegerType::Unsigned, ctx); - case TensorFlowTypes::UINT64_REF: - return mlir::IntegerType::get(64, IntegerType::Unsigned, ctx); - case TensorFlowTypes::COMPLEX64_REF: - return mlir::ComplexType::get(mlir::FloatType::getF32(ctx)); - case TensorFlowTypes::COMPLEX128_REF: - return mlir::ComplexType::get(mlir::FloatType::getF64(ctx)); + if (isa()) return mlir::FloatType::getF16(ctx); + if (isa()) return mlir::FloatType::getF32(ctx); + if (isa()) return mlir::FloatType::getF64(ctx); + if (isa()) return mlir::FloatType::getBF16(ctx); + if (isa()) return mlir::IntegerType::get(1, ctx); + if (isa()) return mlir::IntegerType::get(8, ctx); + if (isa()) return mlir::IntegerType::get(16, ctx); + if (isa()) return mlir::IntegerType::get(32, ctx); + if (isa()) return mlir::IntegerType::get(64, ctx); + if (isa()) + return mlir::IntegerType::get(8, IntegerType::Unsigned, ctx); + if (isa()) + return mlir::IntegerType::get(16, IntegerType::Unsigned, ctx); + if (isa()) + return mlir::IntegerType::get(32, IntegerType::Unsigned, ctx); + if (isa()) + return mlir::IntegerType::get(64, IntegerType::Unsigned, ctx); + if (isa()) + return mlir::ComplexType::get(mlir::FloatType::getF32(ctx)); + if (isa()) + return mlir::ComplexType::get(mlir::FloatType::getF64(ctx)); #define HANDLE_TF_TYPE(tftype, enumerant, name) \ - case TensorFlowTypes::enumerant##_REF: \ - return tftype##Type::get(ctx); + if (isa()) return tftype##Type::get(ctx); #define HANDLE_TF_REF_TYPE(tftype, enumerant, name) // NOLINTNEXTLINE #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.def" - default: - llvm_unreachable("unexpected tensorflow ref type kind"); - } + llvm_unreachable("unexpected tensorflow ref type kind"); } Type TensorFlowTypeWithSubtype::RemoveSubtypes() { MLIRContext* ctx = getContext(); - switch (getKind()) { - case TensorFlowTypes::VARIANT: - return VariantType::get(ctx); - case TensorFlowTypes::RESOURCE: - return 
ResourceType::get(ctx); - default: - llvm_unreachable("unexpected tensorflow type with subtypes kind"); - } + if (isa()) return VariantType::get(ctx); + if (isa()) return ResourceType::get(ctx); + llvm_unreachable("unexpected tensorflow type with subtypes kind"); } ArrayRef TensorFlowTypeWithSubtype::GetSubtypes() { - switch (getKind()) { - case TensorFlowTypes::VARIANT: - return this->cast().getSubtypes(); - case TensorFlowTypes::RESOURCE: - return this->cast().getSubtypes(); - default: - llvm_unreachable("unexpected tensorflow type with subtypes kind"); - } + if (auto variant_type = dyn_cast()) + return variant_type.getSubtypes(); + if (auto resource_type = dyn_cast()) + return resource_type.getSubtypes(); + llvm_unreachable("unexpected tensorflow type with subtypes kind"); } // TODO(jpienaar): BroadcastCompatible and HasCompatibleElementTypes have @@ -306,8 +296,11 @@ ArrayRef TensorFlowTypeWithSubtype::GetSubtypes() { bool BroadcastCompatible(ArrayRef lhs, ArrayRef rhs) { if (lhs.size() != rhs.size()) return false; for (auto types : llvm::zip(lhs, rhs)) { - auto lhs_type = std::get<0>(types); - auto rhs_type = std::get<1>(types); + // Drop ref types because they don't affect broadcast compatibility. E.g., + // `tensor` and `tensor` should be considered broadcast + // compatible. + auto lhs_type = DropRefType(std::get<0>(types)); + auto rhs_type = DropRefType(std::get<1>(types)); // This should be true for all TF ops: auto lhs_tt = lhs_type.dyn_cast(); @@ -366,27 +359,31 @@ bool AreCastCompatible(ArrayRef types) { return true; } -ShapedType DropTypeSubTypes(ShapedType ty) { - Type element_ty = ty.getElementType(); - auto subtype_ty = element_ty.dyn_cast(); - if (!subtype_ty) return ty; +// Assumes a function `GetDefaultTypeOf(ComposedType)` that returns the default +// type for a composed type (such as a ref type or a type with subtypes). 
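+// For example (illustrative types only): DropRefType maps
+// tensor<3x!tf.int32ref> to tensor<3xi32>, DropSubTypes maps
+// !tf.resource<tensor<f32>> to !tf.resource, and any other type is returned
+// unchanged.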
+template +Type DropTypeHelper(Type ty) { + Type element_ty = getElementTypeOrSelf(ty); + auto composed_type = element_ty.dyn_cast(); + if (!composed_type) return ty; - Type default_ty = GetDefaultTypeOf(subtype_ty); - if (ty.hasRank()) return RankedTensorType::get(ty.getShape(), default_ty); - - return UnrankedTensorType::get(default_ty); + Type default_ty = GetDefaultTypeOf(composed_type); + if (auto ranked_ty = ty.dyn_cast()) { + return RankedTensorType::get(ranked_ty.getShape(), default_ty); + } else if (ty.dyn_cast()) { + return UnrankedTensorType::get(default_ty); + } else { + return default_ty; + } } -ShapedType DropRefType(ShapedType ty) { - Type element_ty = ty.getElementType(); - TF::TensorFlowRefType ref_ty = element_ty.dyn_cast(); - if (!ref_ty) return ty; - - Type default_ty = TF::GetDefaultTypeOf(ref_ty); - if (ty.hasRank()) return RankedTensorType::get(ty.getShape(), default_ty); - - return UnrankedTensorType::get(default_ty); +Type DropSubTypes(Type ty) { + return DropTypeHelper(ty); } +Type DropRefType(Type ty) { return DropTypeHelper(ty); } + +Type DropRefAndSubTypes(Type ty) { return DropRefType(DropSubTypes(ty)); } + } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h index 43d5f2fa476..f93f6b657da 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h @@ -67,26 +67,13 @@ using ResultShapeRange = iterator_range; // TensorFlow types //===----------------------------------------------------------------------===// -namespace TensorFlowTypes { -// List of supported TensorFlowType kinds, necessary for isa/dyn_cast. -enum Kind { - FIRST_USED_TENSORFLOW_TYPE = Type::FIRST_TENSORFLOW_TYPE, -#define HANDLE_TF_TYPE(tftype, enumerant, name) enumerant, -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.def" - LAST_USED_TENSORFLOW_TYPE, -}; -} // namespace TensorFlowTypes - // The base class in the TensorFlow type hierarchy. class TensorFlowType : public Type { public: using Type::Type; // Support method to enable LLVM-style type casting. - static bool classof(Type type) { - return type.getKind() >= Type::FIRST_TENSORFLOW_TYPE && - type.getKind() <= TensorFlowTypes::LAST_USED_TENSORFLOW_TYPE; - } + static bool classof(Type type); }; // Returns true if the specified type is a valid TensorFlow element type. @@ -105,10 +92,7 @@ static inline bool IsValidTFTensorType(Type type) { namespace detail { // Common implementation of TensorFlow types. The template argument indicates -// the concrete derived class per CRTP. Concrete classes must implement the -// following: -// - `static unsigned getTypeKind()` that returns the (fixed) kind of the -// type. +// the concrete derived class per CRTP. template class TensorFlowTypeImpl : public Type::TypeBase { @@ -116,11 +100,6 @@ class TensorFlowTypeImpl using Base = typename Type::TypeBase; using TFBase = TensorFlowTypeImpl; using Base::Base; - - // Get the unique'ed type in the given context. - static Derived get(MLIRContext* context) { - return Base::get(context, Derived::getTypeKind()); - } }; } // namespace detail @@ -130,10 +109,7 @@ class TensorFlowRefType : public TensorFlowType { using TensorFlowType::TensorFlowType; // Checks if a type is TensorFlow Ref type. 
- static bool classof(Type type) { - return type.getKind() >= TensorFlowTypes::FLOAT_REF && - type.getKind() <= TensorFlowTypes::LAST_USED_TENSORFLOW_TYPE; - } + static bool classof(Type type); // Converts a type to the corresponding TensorFlowRef type. static TensorFlowType get(Type type); @@ -179,7 +155,6 @@ static inline Type GetElementTypeOrSelfResolveRef(Type type) { class tftype##Type : public detail::TensorFlowTypeImpl { \ public: \ using TFBase::TFBase; \ - static unsigned getTypeKind() { return TensorFlowTypes::enumerant; } \ }; // Custom TensorFlow types are defined separately. @@ -217,8 +192,6 @@ class TypeWithSubtypeStorage : public TypeStorage { // opaque and their interpretation depends on the actual underlying type. // The template argument indicates the concrete derived class per CRTP. Concrete // classes must implement the following: -// - `static unsigned getTypeKind()` that returns the (fixed) kind of the -// type. // - `static std::string getTypeName()` that returns the name of the type for // verification logging. template @@ -230,12 +203,12 @@ class TypeWithSubtypeImpl using Base::Base; static Derived get(ArrayRef subtypes, MLIRContext* context) { - return Base::get(context, Derived::getTypeKind(), subtypes); + return Base::get(context, subtypes); } static Derived getChecked(ArrayRef subtypes, MLIRContext* context, Location loc) { - return Base::getChecked(loc, Derived::getTypeKind(), subtypes); + return Base::getChecked(loc, subtypes); } static Derived get(MLIRContext* context) { return get({}, context); } @@ -263,10 +236,7 @@ class TensorFlowTypeWithSubtype : public TensorFlowType { using TensorFlowType::TensorFlowType; // Checks if a type is TensorFlow type with subtypes. - static bool classof(Type type) { - return type.getKind() == TensorFlowTypes::VARIANT || - type.getKind() == TensorFlowTypes::RESOURCE; - } + static bool classof(Type type); // Converts a TypeWithSubtype type to the same type but without its subtypes. Type RemoveSubtypes(); @@ -288,7 +258,6 @@ static inline Type GetDefaultTypeOf(TensorFlowTypeWithSubtype type) { class ResourceType : public detail::TypeWithSubtypeImpl { public: using TFBase::TFBase; - static unsigned getTypeKind() { return TensorFlowTypes::RESOURCE; } static std::string getTypeName() { return "ResourceType"; } }; @@ -300,7 +269,6 @@ class ResourceType : public detail::TypeWithSubtypeImpl { class VariantType : public detail::TypeWithSubtypeImpl { public: using TFBase::TFBase; - static unsigned getTypeKind() { return TensorFlowTypes::VARIANT; } static std::string getTypeName() { return "VariantType"; } }; @@ -325,15 +293,21 @@ bool HasCompatibleElementTypes(Type lhs, Type rhs, // compatible. bool AreCastCompatible(ArrayRef types); -// If the given tensor has elements of type with subtypes, then returns a new -// type after dropping subtypes info. Otherwise, returns the original type as -// is. -ShapedType DropTypeSubTypes(ShapedType ty); +// If `ty` is a tensor type and its element type has subtypes, then returns a +// new type of same shape but dropped subtypes for the element type. +// Otherwise, if `ty` has subtypes, then returns corresponding type with dropped +// subtypes. +// Otherwise, returns the original type `ty`. +Type DropSubTypes(Type ty); -// If the given tensor has elements of type ref, then returns a new type -// of the shape, but corresponding non-ref type as element type. Otherwise, -// returns the original type as is. 
-ShapedType DropRefType(ShapedType ty); +// If `ty` is a tensor type and has elements of a ref type, then returns a new +// type of same shape but corresponding non-ref type as element type. +// Otherwise, if `ty` is a ref type, then returns corresponding non-ref type. +// Otherwise, returns the original type `ty`. +Type DropRefType(Type ty); + +// Convenience call for executing both `DropRefType` and `DropSubTypes`. +Type DropRefAndSubTypes(Type ty); } // end namespace TF } // end namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir index 595bdce5be4..50486909694 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir @@ -444,6 +444,14 @@ func @testReshapeNoOp(%arg0: tensor<2x4xf32>, %arg1: tensor<2xi32>) -> tensor<2x return %0 : tensor<2x4xf32> } +// CHECK-LABEL: func @testBroadcastToNoOp +func @testBroadcastToNoOp(%arg0: tensor<2x4xf32>, %arg1: tensor<2xi32>) -> tensor<2x4xf32> { + %0 = "tf.BroadcastTo"(%arg0, %arg1) : (tensor<2x4xf32>, tensor<2xi32>) -> tensor<2x4xf32> + + // CHECK: return %arg0 + return %0 : tensor<2x4xf32> +} + // CHECK-LABEL: func @testPackShapeComputation func @testPackShapeComputation(%arg0: tensor, %arg1: tensor, %arg2: tensor<*xf32>) -> (tensor<2xi32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>, tensor<*xi32>) { // Test dimensions sizes. @@ -620,6 +628,15 @@ func @testLogicalNotOfLessEqual(%arg0: tensor<8x16xf32>, %arg1: tensor<8x16xf32> // CHECK: return %0 } +// CHECK-LABEL: testSizeFolding +func @testSizeFolding(%arg0: tensor<3x5x7xf32>) -> tensor { + %0 = "tf.Size"(%arg0) : (tensor<3x5x7xf32>) -> tensor + return %0: tensor + +// CHECK: %0 = "tf.Const"() {value = dense<105> : tensor} : () -> tensor +// CHECK: return %0 : tensor +} + // CHECK-LABEL: testDivWithSqrtDivisor func @testDivWithSqrtDivisor(%arg0: tensor<8x16xf32>, %arg1: tensor<8x16xf32>) -> tensor<8x16xf32> { %0 = "tf.Sqrt"(%arg1) : (tensor<8x16xf32>) -> tensor<8x16xf32> @@ -685,6 +702,15 @@ func @identityTranspose(%arg0: tensor<2x3x4x5x6xf32>) -> tensor<2x3x4x5x6xf32> { // CHECK: return %arg0 } +// CHECK-LABEL: @identityTransposeConst +func @identityTransposeConst(%arg0: tensor<2x3x4x5x6xf32>) -> tensor<2x3x4x5x6xf32> { + %0 = constant dense<[0, 1, 2, 3, 4]> : tensor<5xi32> + %1 = "tf.Transpose"(%arg0, %0) : (tensor<2x3x4x5x6xf32>, tensor<5xi32>) -> tensor<2x3x4x5x6xf32> + + return %1 : tensor<2x3x4x5x6xf32> + // CHECK: return %arg0 +} + // CHECK-LABEL: @nonIdentityTranspose func @nonIdentityTranspose(%arg0: tensor<2x3x4x5x6xf32>) -> tensor<2x3x4x6x5xf32> { %0 = "tf.Const"() {value = dense<[0, 1, 2, 4, 3]> : tensor<5xi32>} : () -> tensor<5xi32> @@ -707,6 +733,17 @@ func @cancellableTranspose(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32> { // CHECK: return %arg0 } +// CHECK-LABEL: @cancellableTransposeConst +func @cancellableTransposeConst(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32> { + %0 = constant dense<[0, 3, 1, 2]> : tensor<4xi32> + %1 = constant dense<[0, 2, 3, 1]> : tensor<4xi32> + %2 = "tf.Transpose"(%arg0, %0) : (tensor<1x4x4x8xf32>, tensor<4xi32>) -> tensor<1x8x4x4xf32> + %3 = "tf.Transpose"(%2, %1) : (tensor<1x8x4x4xf32>, tensor<4xi32>) -> tensor<1x4x4x8xf32> + + return %3 : tensor<1x4x4x8xf32> + // CHECK: return %arg0 +} + // CHECK-LABEL: @nonCancellableTranspose func @nonCancellableTranspose(%arg0: tensor<1x4x4x8xf32>) -> tensor<4x1x4x8xf32> { %0 = "tf.Const"() {value = dense<[0, 3, 
1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> @@ -725,13 +762,72 @@ func @addN(%arg0: tensor<*xf32>) -> tensor<*xf32> { return %0 : tensor<*xf32> } -// CHECK-LABEL: func @ToBool_0DScalar -func @ToBool_0DScalar(%arg0: tensor) -> tensor { +// CHECK-LABEL: func @ToBool_0DScalarI1 +func @ToBool_0DScalarI1(%arg0: tensor) -> tensor { // CHECK: return %arg0 %0 = "tf.ToBool"(%arg0) : (tensor) -> tensor return %0 : tensor } +// CHECK-LABEL: func @ToBool_0DScalarInt +func @ToBool_0DScalarInt(%arg0: tensor) -> tensor { + // CHECK: [[Zero:%.*]] = "tf.Const"() {value = dense<0> : tensor} + // CHECK: [[NE:%.*]] = "tf.NotEqual"(%arg0, [[Zero]]) + // CHECK: return [[NE]] + %0 = "tf.ToBool"(%arg0) : (tensor) -> tensor + return %0 : tensor +} + +// CHECK-LABEL: func @ToBool_0DScalarFloat +func @ToBool_0DScalarFloat(%arg0: tensor) -> tensor { + // CHECK: [[Zero:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor + // CHECK: [[NE:%.*]] = "tf.NotEqual"(%arg0, [[Zero]]) + // CHECK: return [[NE]] + %0 = "tf.ToBool"(%arg0) : (tensor) -> tensor + return %0 : tensor +} + +// CHECK-LABEL: func @ToBool_0DScalarString +func @ToBool_0DScalarString(%arg0: tensor) -> tensor { + // CHECK: [[EmptyStr:%.*]] = "tf.Const"() {value = dense<""> : tensor} : () -> tensor + // CHECK: [[NE:%.*]] = "tf.NotEqual"(%arg0, [[EmptyStr]]) {incompatible_shape_error = false} : (tensor, tensor) -> tensor + // CHECK: return [[NE]] : tensor + %0 = "tf.ToBool"(%arg0) : (tensor) -> tensor + return %0 : tensor +} + +// CHECK-LABEL: func @ToBool_1DTensor +func @ToBool_1DTensor(%arg0: tensor<1xf32>) -> tensor { + // CHECK: [[Const:%.*]] = "tf.Const"() {value = dense : tensor} : () -> tensor + // CHECK: return [[Const]] + %0 = "tf.ToBool"(%arg0) : (tensor<1xf32>) -> tensor + return %0 : tensor +} + +// CHECK-LABEL: func @ToBool_1DTensorZeroDim +func @ToBool_1DTensorZeroDim(%arg0: tensor<0xf32>) -> tensor { + // CHECK: [[Const:%.*]] = "tf.Const"() {value = dense : tensor} : () -> tensor + // CHECK: return [[Const]] + %0 = "tf.ToBool"(%arg0) : (tensor<0xf32>) -> tensor + return %0 : tensor +} + +// CHECK-LABEL: func @ToBool_2DTensor +func @ToBool_2DTensor(%arg0: tensor<1x5xf32>) -> tensor { + // CHECK: [[Const:%.*]] = "tf.Const"() {value = dense : tensor} : () -> tensor + // CHECK: return [[Const]] + %0 = "tf.ToBool"(%arg0) : (tensor<1x5xf32>) -> tensor + return %0 : tensor +} + +// CHECK-LABEL: func @ToBool_2DTensorZeroDim +func @ToBool_2DTensorZeroDim(%arg0: tensor<1x0xf32>) -> tensor { + // CHECK: [[Const:%.*]] = "tf.Const"() {value = dense : tensor} : () -> tensor + // CHECK: return [[Const]] + %0 = "tf.ToBool"(%arg0) : (tensor<1x0xf32>) -> tensor + return %0 : tensor +} + // CHECK-LABEL: testReadVariableOpOfCast func @testReadVariableOpOfCast(%arg0: tensor>>) -> tensor<8x40xf32> { %0 = "tf.Cast"(%arg0) : (tensor>>) -> tensor<*x!tf.resource> @@ -826,6 +922,51 @@ func @foldIf(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> (tens return %4 : tensor } +// CHECK-LABEL: foldIfRegion +func @foldIfRegion(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> (tensor, tensor) { + %false = "tf.Const"() {value = dense : tensor} : () -> tensor + %true = "tf.Const"() {value = dense : tensor} : () -> tensor + + // CHECK: [[Val0:%.*]] = "tf.Mul"(%arg0, %arg1) + %0 = "tf.IfRegion"(%true) ({ + %true_value = "tf.Mul"(%arg0, %arg1) : (tensor, tensor) -> tensor + "tf.Yield"(%true_value) : (tensor) -> () + }, { + %false_value = "tf.Sub"(%arg0, %arg1) : (tensor, tensor) -> tensor + "tf.Yield"(%false_value) : (tensor) -> () + }) { is_stateless = true}: 
(tensor) -> tensor + + // CHECK: [[Val1:%.*]] = "tf.Sub"(%arg0, %arg1) + %1 = "tf.IfRegion"(%false) ({ + %true_value = "tf.Mul"(%arg0, %arg1) : (tensor, tensor) -> tensor + "tf.Yield"(%true_value) : (tensor) -> () + }, { + %false_value = "tf.Sub"(%arg0, %arg1) : (tensor, tensor) -> tensor + "tf.Yield"(%false_value) : (tensor) -> () + }) { is_stateless = true}: (tensor) -> tensor + + // CHECK: return [[Val0]], [[Val1]] + return %0, %1 : tensor, tensor +} + +// CHECK-LABEL: foldIfRegionMismatchedTypes +func @foldIfRegionMismatchedTypes(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor<1xf32> { + %false = "tf.Const"() {value = dense : tensor} : () -> tensor + %true = "tf.Const"() {value = dense : tensor} : () -> tensor + + // CHECK: [[Val0:%.*]] = "tf.Mul"(%arg0, %arg1) + // CHECK-NEXT: [[Cast:%.*]] = "tf.Cast"([[Val0]]) + // CHECK-NEXT: return [[Cast]] + %0 = "tf.IfRegion"(%true) ({ + %true_value = "tf.Mul"(%arg0, %arg1) : (tensor, tensor) -> tensor + "tf.Yield"(%true_value) : (tensor) -> () + }, { + %false_value = "tf.Sub"(%arg0, %arg1) : (tensor, tensor) -> tensor + "tf.Yield"(%false_value) : (tensor) -> () + }) { is_stateless = true}: (tensor) -> tensor<1xf32> + return %0 : tensor<1xf32> +} + // CHECK-LABEL: foldCase func @foldCase(%arg0: tensor, %arg1: tensor) -> (tensor) { %2 = constant dense<1> : tensor @@ -834,11 +975,11 @@ func @foldCase(%arg0: tensor, %arg1: tensor) -> (tensor) { // CHECK: PartitionedCall // CHECK-SAME: device = "noodle" // CHECK-SAME: f = @add - %4 = "tf.Case"(%2, %arg0, %arg1) {branches = [@sub, @add], output_shapes = [#tf.shape<>], device = "noodle"} : (tensor, tensor, tensor) -> tensor + %4 = "tf.Case"(%2, %arg0, %arg1) {branches = [@sub, @add], output_shapes = [#tf.shape<>], device = "noodle", is_stateless = false} : (tensor, tensor, tensor) -> tensor // CHECK: PartitionedCall // CHECK-SAME: _cluster_launch = "not_ready" // CHECK-SAME: f = @sub - %5 = "tf.Case"(%3, %4, %arg1) {branches = [@sub, @add], output_shapes = [#tf.shape<>], _cluster_launch = "not_ready"} : (tensor, tensor, tensor) -> tensor + %5 = "tf.Case"(%3, %4, %arg1) {branches = [@sub, @add], output_shapes = [#tf.shape<>], _cluster_launch = "not_ready", is_stateless = false} : (tensor, tensor, tensor) -> tensor return %5 : tensor } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/device_copy.mlir b/tensorflow/compiler/mlir/tensorflow/tests/device_copy.mlir new file mode 100644 index 00000000000..8250bcf7101 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/device_copy.mlir @@ -0,0 +1,16 @@ +// RUN: tf-opt -tf-tensor-device-copy %s | FileCheck %s --dump-input=fail + +// CHECK-LABEL: func @fold_identity +// CHECK-SAME: ([[arg0:%.*]]: tensor<2x2xf32>, [[arg1:%.*]]: tensor<2x2xf32> +module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32}} { + func @fold_identity(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> { + %0 = tf_executor.graph { + // CHECK: tf.MatMul + %outputs, %control = tf_executor.island wraps "tf.MatMul"(%arg0, %arg1) {device = "", transpose_a = false, transpose_b = false} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + // CHECK-NOT: tf.Identity + %outputs_0, %control_1 = tf_executor.island wraps "tf.Identity"(%outputs) {device = ""} : (tensor<2x2xf32>) -> tensor<2x2xf32> + tf_executor.fetch %outputs_0 : tensor<2x2xf32> + } + return %0 : tensor<2x2xf32> + } +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_outline_island/case_op.mlir 
b/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_outline_island/case_op.mlir index 7d761b5d690..0000d43823b 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_outline_island/case_op.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_outline_island/case_op.mlir @@ -16,7 +16,7 @@ module { "tf.TPUReplicateMetadata"() {_tpu_replicate = "cluster", device = "device", num_replicas = 1, topology = "topology"} : () -> () %index = "tf.opA"(%arg0) {_tpu_replicate = "cluster"} : (tensor) -> tensor %input = "tf.opB"(%arg0) {_tpu_replicate = "cluster"} : (tensor) -> tensor - %result = "tf.Case"(%index, %input) {branches = [@branch_0, @branch_1, @branch_2, @branch_3, @branch_4]} : (tensor, tensor) -> tensor + %result = "tf.Case"(%index, %input) {branches = [@branch_0, @branch_1, @branch_2, @branch_3, @branch_4], is_stateless = false} : (tensor, tensor) -> tensor tf_executor.yield %result : tensor } tf_executor.fetch %output : tensor diff --git a/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-regions.mlir b/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-regions.mlir index c8c82c5c08f..e4e7f0859c8 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-regions.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-regions.mlir @@ -123,6 +123,27 @@ func @testIfNoInputAndNoResult(%arg0: tensor) -> () { // ----- +// If with non tensor condition + +// Simple If +// CHECK: func @testIf1Then{{.+}} +// CHECK: func @testIf1Else{{.+}} +func @testIf1Then(tensor<*xf32>) -> tensor<*xf32> +func @testIf1Else(tensor<*xf32>) -> tensor<*xf32> + +// CHECK-LABEL: func @testIf1Result(%arg0: tensor, %arg1: tensor<*xf32>) +func @testIf1Result(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf32> { + %0 = "tf.If"(%arg0, %arg1) { + then_branch = @testIf1Then, else_branch = @testIf1Else, is_stateless = false + } : (tensor, tensor<*xf32>) -> tensor<*xf32> + + // CHECK: [[ToBool:%.*]] = "tf.ToBool" + // CHECK: "tf.IfRegion"([[ToBool]]) + return %0 : tensor<*xf32> +} + +// ----- + // Simple While func @testWhileCond(tensor<*xf32>) -> (tensor) func @testWhileBody(tensor<*xf32>) -> (tensor<*xf32>) @@ -200,3 +221,58 @@ func @testWhileResult(tensor<*xf32>) -> (tensor<*xf32>) { return %1 : tensor<*xf32> } +// ----- + +// While with non tensor condition +func @testWhileCond(tensor<*xf32>) -> (tensor) +func @testWhileBody(tensor<*xf32>) -> (tensor<*xf32>) + +// CHECK-LABEL: func @testWhileResult +func @testWhileResult(tensor<*xf32>) -> (tensor<*xf32>) { +^bb0(%arg0: tensor<*xf32>): + %1 = "tf.While"(%arg0) { + cond = @testWhileCond, + body = @testWhileBody, + is_stateless = true, + _attr0 = 10, _attr1 = true, attr2 = "hello" + } : (tensor<*xf32>) -> (tensor<*xf32>) + + // CHECK: [[Result0:%.*]] = "tf.WhileRegion" + // CHECK: [[Result1:%.*]] = call @testWhileCond + // CHECK: [[ToBool:%.*]] = "tf.ToBool"([[Result1]]) + // CHECK: "tf.Yield"([[ToBool]]) + // CHECK: [[Result2:%.*]] = call @testWhileBody + // CHECK: "tf.Yield"([[Result2]]) + // CHECK: return [[Result0]] + return %1 : tensor<*xf32> +} + +// ----- + +func @then_branch() -> () +func @else_branch() -> () + +// Test tf.If device is preserved. 
+// CHECK-LABEL: func @testIfDevice +func @testIfDevice(%arg0: tensor) { + "tf.If"(%arg0) {then_branch = @then_branch, else_branch = @else_branch, is_stateless = false, device = "/device:CPU:0"} : (tensor) -> () + + // CHECK: "tf.IfRegion" + // CHECK: device = "/device:CPU:0" + return +} + +// ----- + +func @cond() -> tensor +func @body() -> () + +// Test tf.While device is preserved. +// CHECK-LABEL: func @testWhileDevice +func @testWhileDevice() { + "tf.While"() {cond = @cond, body = @body, is_stateless = false, device = "/device:CPU:0"} : () -> () + + // CHECK: "tf.WhileRegion" + // CHECK: device = "/device:CPU:0" + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir b/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir index e11474c0755..ea55e50db30 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir @@ -479,13 +479,39 @@ func @DynamicStitch_duplicates(%arg0: tensor<2x2xf32>) -> tensor<1x2xf32> { return %0 : tensor<1x2xf32> } -func @Reciprocal(%arg0: tensor<*xf32>) -> tensor<*xf32> { +// CHECK-LABEL: @Reciprocal_i32 +func @Reciprocal_i32(%arg0: tensor<*xi32>) -> tensor<*xi32> { + // CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK: "tf.Div"(%[[ONE]], %arg0) : (tensor, tensor<*xi32>) -> tensor<*xi32> + %0 = "tf.Reciprocal"(%arg0) : (tensor<*xi32>) -> tensor<*xi32> + return %0 : tensor<*xi32> +} + +// CHECK-LABEL: @Reciprocal_f32 +func @Reciprocal_f32(%arg0: tensor<*xf32>) -> tensor<*xf32> { // CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor // CHECK: "tf.Div"(%[[ONE]], %arg0) : (tensor, tensor<*xf32>) -> tensor<*xf32> %0 = "tf.Reciprocal"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } +// CHECK-LABEL: @Reciprocal_complexf32 +func @Reciprocal_complexf32(%arg0: tensor<*xcomplex>) -> tensor<*xcomplex> { + // CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<(1.000000e+00,0.000000e+00)> : tensor>} : () -> tensor> + // CHECK: "tf.Div"(%[[ONE]], %arg0) : (tensor>, tensor<*xcomplex>) -> tensor<*xcomplex> + %0 = "tf.Reciprocal"(%arg0) : (tensor<*xcomplex>) -> tensor<*xcomplex> + return %0 : tensor<*xcomplex> +} + +// CHECK-LABEL: @Reciprocal_complexf64 +func @Reciprocal_complexf64(%arg0: tensor<*xcomplex>) -> tensor<*xcomplex> { + // CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<(1.000000e+00,0.000000e+00)> : tensor>} : () -> tensor> + // CHECK: "tf.Div"(%[[ONE]], %arg0) : (tensor>, tensor<*xcomplex>) -> tensor<*xcomplex> + %0 = "tf.Reciprocal"(%arg0) : (tensor<*xcomplex>) -> tensor<*xcomplex> + return %0 : tensor<*xcomplex> +} + +// CHECK-LABEL: @ScatterNd func @ScatterNd(%arg0: tensor<4x1xi32>, %arg1: tensor<4xf32>) -> tensor<8xf32> { // CHECK: %[[ZERO:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<8xf32>} : () -> tensor<8xf32> // CHECK: "tf.TensorScatterUpdate"(%[[ZERO]], %arg0, %arg1) : (tensor<8xf32>, tensor<4x1xi32>, tensor<4xf32>) -> tensor<8xf32> @@ -494,3 +520,16 @@ func @ScatterNd(%arg0: tensor<4x1xi32>, %arg1: tensor<4xf32>) -> tensor<8xf32> { %0 = "tf.ScatterNd"(%arg0, %arg1, %shape) : (tensor<4x1xi32>, tensor<4xf32>, tensor<1xi32>) -> tensor<8xf32> return %0 : tensor<8xf32> } + +// CHECK-LABEL: @_UnaryOpsComposition +// CHECK-SAME: %[[ARG0:.*]]: tensor<4xf32> +func @_UnaryOpsComposition(%arg0: tensor<4xf32>) -> tensor<4xf32> { + + // CHECK: %[[RESULT0:.*]] = "tf.Asin"(%[[ARG0]]) + // CHECK: %[[RESULT1:.*]] = "tf.Abs"(%[[RESULT0]]) + // CHECK: %[[RESULT2:.*]] = 
"tf.Log"(%[[RESULT1]]) + // CHECK: return %[[RESULT2]] + + %0 = "tf._UnaryOpsComposition"(%arg0) {op_names = ["Asin", "Abs", "Log"]} : (tensor<4xf32>) -> tensor<4xf32> + return %0 : tensor<4xf32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mark_ops_for_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mark_ops_for_outside_compilation.mlir index 9544a02dca4..df2add2208a 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mark_ops_for_outside_compilation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mark_ops_for_outside_compilation.mlir @@ -136,6 +136,7 @@ func @if_region_captured_string(%arg0: tensor, %arg1: tensor) -> // CHECK-NOT: _xla_outside_compilation // CHECK: "tf.IfRegion" // CHECK: "tf.StringToNumber" + // CHECK-NOT: _xla_outside_compilation // CHECK: _xla_outside_compilation = "auto", is_stateless = true %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor %2 = "tf.IfRegion"(%arg0) ( { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/func_list_attr.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/func_list_attr.mlir index c6543f3121e..09a38b5b5de 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/func_list_attr.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/func_list_attr.mlir @@ -43,7 +43,7 @@ func @main() { // CHECK-NEXT: } // CHECK-NEXT: } // CHECK: } - %1:2 = tf_executor.island wraps "tf.Case"(%0#0) {Tin = [], Tout = ["tfdtype$DT_FLOAT"], branches = [@foo, @bar], device = "", output_shapes = []} : (tensor) -> tensor<*xf32> loc("Case") + %1:2 = tf_executor.island wraps "tf.Case"(%0#0) {Tin = [], Tout = ["tfdtype$DT_FLOAT"], branches = [@foo, @bar], device = "", output_shapes = [], is_stateless = false} : (tensor) -> tensor<*xf32> loc("Case") tf_executor.fetch } return diff --git a/tensorflow/compiler/mlir/tensorflow/tests/region-control-flow-to-functional.mlir b/tensorflow/compiler/mlir/tensorflow/tests/region-control-flow-to-functional.mlir index e9d4e441a10..3e8935b699e 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/region-control-flow-to-functional.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/region-control-flow-to-functional.mlir @@ -212,6 +212,28 @@ func @testNoOutputs(%arg0: tensor, %arg1: tensor<*xf32>) -> () { return } +// ----- +// Check ToBool folding for IfRegion +// CHECK: func @tf.IfRegion_else(%arg0: tensor<*xf32>) -> tensor<*xf32> +// CHECK-NEXT: "tf.Neg" +// CHECK: func @tf.IfRegion_then(%arg0: tensor<*xf32>) -> tensor<*xf32> +// CHECK-NEXT: "tf.Abs" +// CHECK-LABEL: @testToBoolFold +func @testToBoolFold(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf32> { + // CHECK-NEXT: "tf.If"(%arg0, %arg1) + // CHECK-SAME: else_branch = @tf.IfRegion_else + // CHECK-SAME: then_branch = @tf.IfRegion_then + %tobool = "tf.ToBool"(%arg0) : (tensor) -> tensor + %0 = "tf.IfRegion"(%tobool) ({ + %1 = "tf.Abs"(%arg1) : (tensor<*xf32>) -> tensor<*xf32> + "tf.Yield"(%1) : (tensor<*xf32>) -> () + }, { + %2 = "tf.Neg"(%arg1) : (tensor<*xf32>) -> tensor<*xf32> + "tf.Yield"(%2) : (tensor<*xf32>) -> () + }) {is_stateless = true} : (tensor) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + // ----- // Simple WhileRegion @@ -592,3 +614,64 @@ func @testWhileRegionBlockArgMismatch(%arg0 : tensor<*xf32>, %arg1 : tensor // CHECK: return [[Result]]#0 return %0#0 : tensor<*xf32> } + +// ----- + +// Simple trivially transformable while with ToBool +// CHECK: func @while_cond +// CHECK: func @while_body +// CHECK-LABEL: testWhileRegionTrivial +func 
@while_cond(%arg0 : tensor<*xf32>, %arg1 : tensor) -> tensor +func @while_body(%arg0 : tensor<*xf32>, %arg1 : tensor) -> (tensor<*xf32>, tensor) +func @testWhileRegionTrivial(%arg0 : tensor<*xf32>, %arg1 : tensor) -> tensor<*xf32> { + // CHECK: [[Result:%.*]]:2 = "tf.While"(%arg0, %arg1) {body = @while_body, cond = @while_cond + %0:2 = "tf.WhileRegion"(%arg0, %arg1) ( + { + ^bb0(%carg0: tensor<*xf32>, %carg1: tensor): + %cond_i32 = call @while_cond(%carg0, %carg1) : (tensor<*xf32>, tensor) -> tensor + %cond = "tf.ToBool"(%cond_i32) : (tensor) -> tensor + "tf.Yield"(%cond) : (tensor) -> () + }, + { + // loop body + ^bb0(%barg0: tensor<*xf32>, %barg1: tensor): + %bdy:2 = call @while_body(%barg0, %barg1) : (tensor<*xf32>, tensor) -> (tensor<*xf32>, tensor) + "tf.Yield"(%bdy#0, %bdy#1) : (tensor<*xf32>, tensor) -> () + } + ) { is_stateless = false } : (tensor<*xf32>, tensor) -> (tensor<*xf32>, tensor) + // CHECK: return [[Result]]#0 + return %0#0 : tensor<*xf32> +} + +// ----- + +// Test tf.IfRegion device is preserved. +// CHECK-LABEL: func @testIfRegionDevice +func @testIfRegionDevice(%arg0: tensor) { + "tf.IfRegion"(%arg0) ({ + "tf.Yield"() : () -> () + }, { + "tf.Yield"() : () -> () + }) {is_stateless = false, device = "/device:CPU:0"} : (tensor) -> () + + // CHECK: "tf.If" + // CHECK-SAME: device = "/device:CPU:0" + return +} + +// ----- + +// Test tf.WhileRegion device is preserved. +// CHECK-LABEL: func @testWhileRegionDevice +func @testWhileRegionDevice() { + "tf.WhileRegion"() ( { + %0 = "tf.Const"() {value = dense : tensor} : () -> tensor + "tf.Yield"(%0) : (tensor) -> () + }, { + "tf.Yield"() : () -> () + }) {is_stateless = false, device = "/device:CPU:0"} : () -> () + + // CHECK: "tf.While" + // CHECK-SAME: device = "/device:CPU:0" + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/resource-alias-analysis-test.mlir b/tensorflow/compiler/mlir/tensorflow/tests/resource-alias-analysis-test.mlir index 87da399b726..da0a2df9e6a 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/resource-alias-analysis-test.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/resource-alias-analysis-test.mlir @@ -173,7 +173,7 @@ func @passthru(%arg0: !tf_res) -> (!tf_res, !tf_res) { // ----- // Test aliasing through IfRegion -!tf_res = type tensor<*x!tf.resource>> +!tf_res = type tensor<*x!tf.resource>> // CHECK-LABEL: func @if_region_aliasing // expected-remark@below {{Region #0, Arg #0, ID 7 : 1, 4, 6, 7}} @@ -181,7 +181,7 @@ func @passthru(%arg0: !tf_res) -> (!tf_res, !tf_res) { func @if_region_aliasing(%arg0: !tf_res, %arg1: !tf_res) { // expected-remark@below {{Result #0, ID 0 : 0, 1, 3, 4, 5}} %vh0 = "tf.VarHandleOp"() {container = "c", shared_name = "v0"} : () -> !tf_res - %read0 = "tf.ReadVariableOp"(%vh0) : (!tf_res) -> tensor<32xf32> + %read0 = "tf.ReadVariableOp"(%vh0) : (!tf_res) -> tensor // expected-remark@below {{Result #0, ID 4 : Unknown}} // expected-remark@below {{Result #1, ID 5 : 0, 1, 2, 3, 4, 5, 6, 8}} // expected-remark@below {{Result #2, ID 6 : 1, 2, 4, 5, 6, 7, 8}} @@ -195,7 +195,7 @@ func @if_region_aliasing(%arg0: !tf_res, %arg1: !tf_res) { // expected-remark@below {{Result #0, ID 3 : 0, 1, 3, 4, 5}} %id0 = "tf.Identity"(%vh0) : (!tf_res) -> !tf_res "tf.Yield"(%id0, %id0, %arg0) : (!tf_res, !tf_res, !tf_res) -> () - }) {is_stateless = true} : (tensor<32xf32>) -> (!tf_res, !tf_res, !tf_res) + }) {is_stateless = true} : (tensor) -> (!tf_res, !tf_res, !tf_res) return } @@ -232,3 +232,55 @@ func @while_region_aliasing(%arg0: !tf_res, %arg1: !tf_res, %arg2: 
!tf_res) { return } +// ----- +// Test aliasing through calls +!tf_res = type tensor<*x!tf.resource>> + +// CHECK-LABEL: func @aliasing_through_calls +func @aliasing_through_calls(%arg0: tensor<32xf32>) -> () { + // expected-remark@below {{Result #0, ID 0 : 0, 1, 2}} + %vh0 = "tf.VarHandleOp"() {container = "c", shared_name = "v0"} : () -> !tf_res + // expected-remark@below {{Result #0, ID 1 : Unknown}} + // expected-remark@below {{Result #1, ID 2 : 0, 1, 2}} + %c:2 = call @passthru(%vh0) : (!tf_res) -> (!tf_res, !tf_res) + return +} + +// expected-remark@below {{Region #0, Arg #0, ID 1 : 1}} +func @passthru(%arg0: !tf_res) -> (!tf_res, !tf_res) { + // expected-remark@below {{Result #0, ID 0 : 0}} + %vh0 = "tf.VarHandleOp"() {container = "c", shared_name = "v0"} : () -> !tf_res + return %vh0, %arg0 : !tf_res, !tf_res +} + +// ----- +// Test aliasing through tf_device.launch +!tf_res = type tensor<*x!tf.resource>> + +// CHECK-LABEL: func @aliasing_through_launch +func @aliasing_through_launch(%arg0: tensor<32xf32>) { + // expected-remark@below {{Result #0, ID 0 : 0, 1}} + %vh = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> !tf_res + + // expected-remark@below {{Result #0, ID 1 : 0, 1}} + %launch = "tf_device.launch"() ({ + tf_device.return %vh : !tf_res + }) {device = ""} : () -> !tf_res + return +} + +// ----- +// Test aliasing through tf_device.cluster +!tf_res = type tensor<*x!tf.resource>> + +// CHECK-LABEL: func @aliasing_through_cluster +func @aliasing_through_cluster(%arg0: tensor<32xf32>) { + // expected-remark@below {{Result #0, ID 0 : 0, 1}} + %vh = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> !tf_res + + // expected-remark@below {{Result #0, ID 1 : 0, 1}} + %cluster = "tf_device.cluster"() ({ + tf_device.return %vh : !tf_res + }) : () -> !tf_res + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/resource-device-inference.mlir b/tensorflow/compiler/mlir/tensorflow/tests/resource-device-inference.mlir index dd622e565c0..75cafde88e3 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/resource-device-inference.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/resource-device-inference.mlir @@ -424,3 +424,117 @@ func @propagate_if_region_inlined( } return } + +// Test propagation through WhileRegion (inlined calls) +// CHECK-LABEL: func @propagate_while_region_inlined +func @propagate_while_region_inlined( + %arg0: !tf_res {tf.device = "/TPU:0"}, + %arg1: tensor) { + tf_executor.graph { + // CHECK: tf_executor.island + %island = tf_executor.island { + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/TPU:0"} + %id0 = "tf.Identity"(%arg0) : (!tf_res) -> !tf_res + // CHECK-NEXT: "tf.VarHandleOp" + %var_handle = "tf.VarHandleOp"() {container = "c", shared_name = "v0", device = "/TPU:1"} : () -> !tf_res + // CHECK-NEXT: "tf.WhileRegion" + "tf.WhileRegion"(%arg1, %id0, %var_handle) ({ + ^bb0(%carg0: tensor, %carg1: !tf_res, %carg2: !tf_res): + // CHECK: ^bb + // CHECK: "tf.Identity" + // CHECK-SAME: {device = "/TPU:0"} + %cid0 = "tf.Identity"(%carg1) : (!tf_res) -> !tf_res loc("cid0") + %read = "tf.ReadVariableOp"(%cid0) : (!tf_res) -> tensor<32xf32> + %cst = constant dense<3.0> : tensor<32xf32> + %cmp = "tf.Less"(%read, %cst) : (tensor<32xf32>, tensor<32xf32>) -> tensor<32xi1> + %dims = constant dense<0> : tensor<1xi32> + %reduce = "tf.All"(%cmp, %dims) {keep_dims = false} : (tensor<32xi1>, tensor<1xi32>) -> tensor + "tf.Yield"(%reduce) : (tensor) -> () + }, { + ^bb0(%barg0: tensor, %barg1: !tf_res, %barg2: !tf_res): + // 
CHECK: ^bb + // CHECK: "tf.Identity" + // CHECK-SAME: {device = "/TPU:0"} + %bid0 = "tf.Identity"(%barg1) : (!tf_res) -> !tf_res + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/TPU:1"} + %id1 = "tf.Identity"(%barg2) : (!tf_res) -> !tf_res + "tf.Yield"(%barg0, %bid0, %id1) : (tensor, !tf_res,!tf_res) -> () + }){is_stateless = false} + : (tensor, !tf_res, !tf_res) -> (tensor, !tf_res, !tf_res) + tf_executor.yield + } + tf_executor.fetch %island : !tf_executor.control + } + return +} + +// Test propagation through WhileRegion (non-inlined calls) +// CHECK-LABEL: func @propagate_while_region +func @propagate_while_region( + %arg0: !tf_res {tf.device = "/TPU:0"}, + %arg1: tensor) { + tf_executor.graph { + // CHECK: tf_executor.island + %island = tf_executor.island { + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/TPU:0"} + %id0 = "tf.Identity"(%arg0) : (!tf_res) -> !tf_res + // CHECK-NEXT: "tf.VarHandleOp" + %var_handle = "tf.VarHandleOp"() {container = "c", shared_name = "v0", device = "/TPU:1"} : () -> !tf_res + // CHECK-NEXT: "tf.WhileRegion" + "tf.WhileRegion"(%arg1, %id0, %var_handle) ({ + ^bb0(%carg0: tensor, %carg1: !tf_res, %carg2: !tf_res): + %cond = call @whileregion_cond(%carg0, %carg1, %carg2) : (tensor, !tf_res, !tf_res) -> tensor + "tf.Yield"(%cond) : (tensor) -> () + }, { + ^bb0(%barg0: tensor, %barg1: !tf_res, %barg2: !tf_res): + %new_values:3 = call @whileregion_body(%barg0, %barg1, %barg2) : (tensor, !tf_res,!tf_res) -> (tensor, !tf_res,!tf_res) + "tf.Yield"(%new_values#0, %new_values#1, %new_values#2) : (tensor, !tf_res,!tf_res) -> () + }){is_stateless = false} + : (tensor, !tf_res, !tf_res) -> (tensor, !tf_res, !tf_res) + tf_executor.yield + } + tf_executor.fetch %island : !tf_executor.control + } + return +} + +// CHECK-LABEL: func @whileregion_body +func @whileregion_body(%arg0: tensor, %arg1: !tf_res, %arg2: !tf_res) -> (tensor, !tf_res, !tf_res) { + %graph:3 = tf_executor.graph { + // CHECK: tf_executor.island + %island:4 = tf_executor.island { + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/TPU:0"} + %id0 = "tf.Identity"(%arg1) : (!tf_res) -> !tf_res + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/TPU:1"} + %id1 = "tf.Identity"(%arg2) : (!tf_res) -> !tf_res + tf_executor.yield %arg0, %id0, %id1 : tensor, !tf_res, !tf_res + } + tf_executor.fetch %island#0, %island#1, %island#2 : tensor, !tf_res, !tf_res + } + return %graph#0, %graph#1, %graph#2: tensor, !tf_res, !tf_res +} + +// CHECK-LABEL: func @whileregion_cond +func @whileregion_cond(%arg0: tensor, %arg1: !tf_res, %arg2: !tf_res) -> tensor { + %graph = tf_executor.graph { + // CHECK: tf_executor.island + %island:2 = tf_executor.island { + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/TPU:0"} + %id0 = "tf.Identity"(%arg1) : (!tf_res) -> !tf_res + %read = "tf.ReadVariableOp"(%id0) : (!tf_res) -> tensor<32xf32> + %cst = constant dense<3.0> : tensor<32xf32> + %cmp = "tf.Less"(%read, %cst) : (tensor<32xf32>, tensor<32xf32>) -> tensor<32xi1> + %dims = constant dense<0> : tensor<1xi32> + %reduce = "tf.All"(%cmp, %dims) {keep_dims = false} : (tensor<32xi1>, tensor<1xi32>) -> tensor + tf_executor.yield %reduce : tensor + } + tf_executor.fetch %island#0 : tensor + } + return %graph : tensor +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir b/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir index ac5c2df8f7e..213ca402f56 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir +++ 
b/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir @@ -112,26 +112,6 @@ func @internal_resource() -> tensor<*xi32> { // ----- -// Tests that pass fails when there are remaining resource operationss that can -// not be lifted. - -func @lifting_failure() -> tensor<*xi32> { - - %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> - - // expected-error @+1 {{has remaining resource inputs that can not be lifted}} - %1 = "tf_device.cluster"() ( { - %2 = "tf.ReadVariableOp"(%0) {dtype = i32} : (tensor<*x!tf.resource>) -> tensor<*xi32> - %3 = "tf.SomeResourceOp"(%0, %2) : (tensor<*x!tf.resource>, tensor<*xi32>) -> tensor<*xi32> - "tf.AssignVariableOp"(%0, %3) {dtype = i32} : (tensor<*x!tf.resource>, tensor<*xi32>) -> () - tf_device.return %3 : tensor<*xi32> - }) {cluster_attr = "cluster_attr"} : () -> tensor<*xi32> - - return %1 : tensor<*xi32> -} - -// ----- - // Tests that pass lifts resource reads/writes from a loop, and removed unused // resources. @@ -347,30 +327,6 @@ func @while_cond(%arg0: tensor<*x!tf.resource>>) -> tensor { // ----- -// Tests that pass reports error on unsupported ops in loop body. - -func @cluster_with_loop() -> () { - %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> - "tf_device.cluster"() ( { - %1 = "tf.While"(%0) { - body = @while_body, cond = @while_cond, device = "", is_stateless = false} - : (tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>) - tf_device.return - }) {cluster_attr = "cluster_attr"} : () -> () - return -} -func @while_body(%arg0: tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>) { - // expected-error @+1 {{found unsupported operations on resource.}} - "tf._UnknownOp"(%arg0) : (tensor<*x!tf.resource>>) -> () - return %arg0 : tensor<*x!tf.resource>> -} -func @while_cond(%arg0: tensor<*x!tf.resource>>) -> tensor { - %read = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf.resource>>) -> tensor - return %read : tensor -} - -// ----- - // Tests that pass reports error on unsupported ops in loop cond. 
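// (Background, stated as a hedged sketch rather than the pass contract: resource lifting
// hoists `tf.ReadVariableOp`s above the cluster/loop and sinks `tf.AssignVariableOp`s
// below it, so the lifted body computes only on plain tensors. Illustrative shape of a
// lifted read-modify-write cluster, with made-up names:
//   %read = "tf.ReadVariableOp"(%var) : (tensor<*x!tf.resource<tensor<f32>>>) -> tensor<f32>
//   %new = "tf_device.cluster"() ( {
//     %sum = "tf.AddV2"(%read, %read) : (tensor<f32>, tensor<f32>) -> tensor<f32>
//     tf_device.return %sum : tensor<f32>
//   }) {cluster_attr = "cluster_attr"} : () -> tensor<f32>
//   "tf.AssignVariableOp"(%var, %new) : (tensor<*x!tf.resource<tensor<f32>>>, tensor<f32>) -> ()
// The test below covers a case where the rewrite is rejected instead.)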
func @cluster_with_loop() -> () { @@ -409,7 +365,7 @@ func @cluster_with_case(%arg0: tensor) -> tensor<4xf32> { // CHECK: %[[CLUSTER:.*]]:2 = "tf_device.cluster"() %2 = "tf_device.cluster"() ( { // CHECK: %[[CASE:.*]]:2 = "tf.Case"(%[[ARG0]], %[[READ0]], %[[READ1]]) - %3:2 = "tf.Case"(%arg0, %0, %1) {branches = [@branch_0, @branch_1, @branch_2]} + %3:2 = "tf.Case"(%arg0, %0, %1) {branches = [@branch_0, @branch_1, @branch_2], is_stateless = false} : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>, tensor<4xf32>) // CHECK-NEXT: %[[ADD:.*]] = "tf.AddV2"(%[[CASE]]#1, %[[CASE]]#0) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir index 4a5e3c8deaa..3e613573d42 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir @@ -2,69 +2,69 @@ // RUN: tf-opt %s -tf-shape-inference=propagate-caller-callee-constants -verify-diagnostics | FileCheck %s module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 130 : i32}} { -// CHECK-LABEL: func @main(%arg0: tensor<1xi32>, %arg1: tensor<1xi32>) -> tensor<1xi32> + // CHECK-LABEL: func @main(%arg0: tensor<1xi32>, %arg1: tensor<1xi32>) -> tensor<1xi32> func @main(%arg0: tensor<1xi32>, %arg1: tensor<1xi32>) -> tensor<*xi32> { - // CHECK: %[[RESULT:.*]] = "tf.AddV2" - // CHECK-SAME: (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> - // CHECK: return %[[RESULT]] : tensor<1xi32> + // CHECK: %[[RESULT:.*]] = "tf.AddV2" + // CHECK-SAME: (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> + // CHECK: return %[[RESULT]] : tensor<1xi32> %0 = "tf.Cast"(%arg0) : (tensor<1xi32>) -> tensor<*xi32> %1 = "tf.Cast"(%arg1) : (tensor<1xi32>) -> tensor<*xi32> %2 = "tf.AddV2"(%0, %1) : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi32> return %2 : tensor<*xi32> } -// CHECK-LABEL: func @simple_chain + // CHECK-LABEL: func @simple_chain func @simple_chain(%arg0: tensor<1xf32>) -> tensor<*xf32> { -// CHECK: %[[MUL:.*]] = "tf.Mul"{{.*}} (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> -// CHECK: %[[ADD:.*]] = "tf.Add"(%[[MUL]], %[[MUL]]) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> -// CHECK: return %[[ADD]] : tensor<1xf32> + // CHECK: %[[MUL:.*]] = "tf.Mul"{{.*}} (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + // CHECK: %[[ADD:.*]] = "tf.Add"(%[[MUL]], %[[MUL]]) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + // CHECK: return %[[ADD]] : tensor<1xf32> %0 = "tf.Mul"(%arg0, %arg0) : (tensor<1xf32>, tensor<1xf32>) -> tensor<*xf32> %1 = "tf.Add"(%0, %0) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> return %1 : tensor<*xf32> } -// CHECK-LABEL: func @simple_chain_with_broadcast + // CHECK-LABEL: func @simple_chain_with_broadcast func @simple_chain_with_broadcast(%arg0: tensor<1xf32>, %arg1: tensor<10xf32>) -> tensor<*xf32> { -// CHECK: %[[MUL:.*]] = "tf.Mul"{{.*}} (tensor<1xf32>, tensor<10xf32>) -> tensor<10xf32> -// CHECK: %[[ADD:.*]] = "tf.Add"(%[[MUL]], %[[MUL]]) : (tensor<10xf32>, tensor<10xf32>) -> tensor<10xf32> -// CHECK: %[[CAST:.*]] = "tf.Cast"(%[[ADD]]) {{.*}} : (tensor<10xf32>) -> tensor<*xf32> -// CHECK: %[[UNKNOWN:.*]] = addf %[[CAST]], %[[CAST]] : tensor<*xf32> -// CHECK: return %[[UNKNOWN]] : tensor<*xf32> + // CHECK: %[[MUL:.*]] = "tf.Mul"{{.*}} (tensor<1xf32>, tensor<10xf32>) -> tensor<10xf32> + // CHECK: %[[ADD:.*]] = "tf.Add"(%[[MUL]], %[[MUL]]) : (tensor<10xf32>, tensor<10xf32>) -> tensor<10xf32> + // CHECK: %[[CAST:.*]] = 
"tf.Cast"(%[[ADD]]) {{.*}} : (tensor<10xf32>) -> tensor<*xf32> + // CHECK: %[[UNKNOWN:.*]] = addf %[[CAST]], %[[CAST]] : tensor<*xf32> + // CHECK: return %[[UNKNOWN]] : tensor<*xf32> %0 = "tf.Mul"(%arg0, %arg1) : (tensor<1xf32>, tensor<10xf32>) -> tensor<*xf32> %1 = "tf.Add"(%0, %0) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> %2 = addf %1, %1 : tensor<*xf32> return %2 : tensor<*xf32> } -// CHECK-LABEL: func @unknown_op + // CHECK-LABEL: func @unknown_op func @unknown_op(%arg0: tensor<1xf32>) -> tensor<*xf32> { -// CHECK: %[[MUL:.*]] = "tf.Mul"{{.*}} (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> -// CHECK: %[[UNKNOWN:.*]] = "tf.Unknown"(%[[MUL]], %[[MUL]]) : (tensor<1xf32>, tensor<1xf32>) -> tensor<*xf32> -// CHECK: return %[[UNKNOWN]] : tensor<*xf32> + // CHECK: %[[MUL:.*]] = "tf.Mul"{{.*}} (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + // CHECK: %[[UNKNOWN:.*]] = "tf.Unknown"(%[[MUL]], %[[MUL]]) : (tensor<1xf32>, tensor<1xf32>) -> tensor<*xf32> + // CHECK: return %[[UNKNOWN]] : tensor<*xf32> %0 = "tf.Mul"(%arg0, %arg0) : (tensor<1xf32>, tensor<1xf32>) -> tensor<*xf32> %1 = "tf.Unknown"(%0, %0) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> return %1 : tensor<*xf32> } -// CHECK-LABEL: func @multiple_blocks_one_return(%arg0: tensor) -> tensor -func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { - br ^bb1 -^bb1: -// CHECK: %[[IDENTITY:.*]] = "tf.Identity"(%arg0) : (tensor) -> tensor -// CHECK: return %[[IDENTITY]] : tensor - %ret = "tf.Identity"(%arg0) : (tensor) -> tensor<*xf32> - return %ret : tensor<*xf32> -} + // CHECK-LABEL: func @multiple_blocks_one_return(%arg0: tensor) -> tensor + func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { + br ^bb1 + ^bb1: + // CHECK: %[[IDENTITY:.*]] = "tf.Identity"(%arg0) : (tensor) -> tensor + // CHECK: return %[[IDENTITY]] : tensor + %ret = "tf.Identity"(%arg0) : (tensor) -> tensor<*xf32> + return %ret : tensor<*xf32> + } -// Tests the case where an inference opportunity relies on folding. + // Tests the case where an inference opportunity relies on folding. -// CHECK-LABEL: func @simple_folding + // CHECK-LABEL: func @simple_folding func @simple_folding(%arg0: tensor<1x1x1x1xi32>, %arg1: tensor<1x1x1x1xf32>) -> tensor { -// CHECK: %[[SHAPE:.*]] = "tf.Shape" -// CHECK: %[[CONV:.*]] = "tf.Conv2DBackpropInput"(%[[SHAPE]] -// CHECK-SAME: (tensor<4xi32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> -// CHECK: return %[[CONV]] : tensor<1x1x1x1xf32> + // CHECK: %[[SHAPE:.*]] = "tf.Shape" + // CHECK: %[[CONV:.*]] = "tf.Conv2DBackpropInput"(%[[SHAPE]] + // CHECK-SAME: (tensor<4xi32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> + // CHECK: return %[[CONV]] : tensor<1x1x1x1xf32> %0 = "tf.Shape"(%arg0) : (tensor<1x1x1x1xi32>) -> tensor<4xi32> %1 = "tf.Conv2DBackpropInput"(%0, %arg1, %arg1) { padding = "VALID", strides = [1, 1, 1, 1] @@ -72,7 +72,7 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { return %1 : tensor } -// Tests where tf.Const's value needs to be refined. + // Tests where tf.Const's value needs to be refined. func @const_refine() -> tensor<*xi32> { %0 = "tf.Const"() {value = dense<[3, 2]> : tensor<2xi32>} : () -> tensor<*xi32> @@ -81,9 +81,9 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { return %0 : tensor<*xi32> } -// Tests the case where an op's shape function returns non-fully-defined shapes. + // Tests the case where an op's shape function returns non-fully-defined shapes. 
-// CHECK-LABEL: func @op_non_fully_defined_shape_fn + // CHECK-LABEL: func @op_non_fully_defined_shape_fn func @op_non_fully_defined_shape_fn(%arg0: tensor<0xi32>, %arg1: tensor<0xi32>) -> tensor { // CHECK: tf.BroadcastGradientArgs // CHECK-SAME: (tensor<0xi32>, tensor<0xi32>) -> (tensor, tensor) @@ -91,7 +91,7 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { return %2#0 : tensor } -// CHECK-LABEL: func @shape_from_const_input + // CHECK-LABEL: func @shape_from_const_input func @shape_from_const_input(%arg0: tensor<3x3x32x64xf32>, %arg1: tensor<200x24x24x64xf32>) -> tensor { %0 = "tf.Const"() {value = dense<[200, 26, 26, 32]> : tensor<4xi32>} : () -> tensor<4xi32> // CHECK: tf.Conv2DBackpropInput @@ -223,7 +223,7 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { // CHECK-SAME: %[[ARG_1:.*]]: tensor>> func @shape_from_case_to_branch_functions(%arg0: tensor, %arg1: tensor>>) -> tensor<1x2x3xf32> { // CHECK: %[[CASE:.*]] = "tf.Case"(%[[ARG_0]], %[[ARG_1]]) - %0 = "tf.Case"(%arg0, %arg1) {branches = [@branch_0, @branch_1]} : (tensor, tensor>>) -> tensor<1x2x3xf32> + %0 = "tf.Case"(%arg0, %arg1) {branches = [@branch_0, @branch_1], is_stateless = false} : (tensor, tensor>>) -> tensor<1x2x3xf32> // CHECK: return %[[CASE]] : tensor<1x2x3xf32> return %0 : tensor<1x2x3xf32> } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tensor_list_ops_decomposition.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tensor_list_ops_decomposition.mlir index 3d187aa5d60..92cb0458bf9 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tensor_list_ops_decomposition.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tensor_list_ops_decomposition.mlir @@ -256,7 +256,7 @@ func @main(%arg0: tensor) -> () { %max_size = "tf.Const"() {value = dense<10> : tensor} : () -> tensor // CHECK-NOT: tf.EmptyTensorList %tl = "tf.EmptyTensorList"(%elem_shape, %max_size) : (tensor<0xi32>, tensor) -> tensor>> - %case_op = "tf.Case"(%arg0, %tl) {branches = [@branch_0, @branch_1, @branch_2]} + %case_op = "tf.Case"(%arg0, %tl) {branches = [@branch_0, @branch_1, @branch_2], is_stateless = false} : (tensor, tensor>>) -> tensor>> // CHECK: "tf.Slice" %pop:2 = "tf.TensorListPopBack"(%case_op, %elem_shape) : (tensor>>, tensor<0xi32>) -> (tensor>>, tensor) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir index 20a0e22c48e..9a8d97eddf1 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir @@ -775,12 +775,30 @@ func @testInvalidIfOp(tensor, tensor<2xf32>) -> tensor<2xf32> { // ----- func @testIfThen(tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> -func @testIfElse(tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> +func @testIfElse(tensor<2xf32>) -> tensor<2xf32> // Test invalid tf.If operation func @testInvalidIfOp(tensor, tensor<2xf32>) -> tensor<2xf32> { ^bb0(%arg0: tensor, %arg1: tensor<2xf32>): - // expected-error @+1 {{branches should have 1 inputs}} + // expected-error @+1 {{expects all branches to have 1 input(s), but 'then_branch' has 2 input(s)}} + %1 = "tf.If"(%arg0, %arg1) { + then_branch = @testIfThen, + else_branch = @testIfElse, + is_stateless = false + } : (tensor, tensor<2xf32>) -> tensor<2xf32> + + return %1 : tensor<2xf32> +} + +// ----- + +func @testIfThen(tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>) +func @testIfElse(tensor<2xf32>) -> tensor<2xf32> + +// Test invalid tf.If operation +func @testInvalidIfOp(tensor, tensor<2xf32>) -> 
tensor<2xf32> { +^bb0(%arg0: tensor, %arg1: tensor<2xf32>): + // expected-error @+1 {{expects all branches to have 1 result(s), but 'then_branch' has 2 result(s)}} %1 = "tf.If"(%arg0, %arg1) { then_branch = @testIfThen, else_branch = @testIfElse, @@ -798,7 +816,7 @@ func @testIfElse(tensor<*xf32>) -> tensor<*xf32> // Test invalid tf.If operation func @testInvalidIfOp(tensor, tensor<2xf32>) -> tensor<2xf32> { ^bb0(%arg0: tensor, %arg1: tensor<2xf32>): - // expected-error @+1 {{then branch input type tensor<*xf16> is incompatible with operand type tensor<2xf32>}} + // expected-error @+1 {{expects operand type 'tensor<2xf32>' to be cast compatible with 'then_branch' input type 'tensor<*xf16>' at index 0}} %1 = "tf.If"(%arg0, %arg1) { then_branch = @testIfThen, else_branch = @testIfElse, @@ -816,7 +834,7 @@ func @testIfElse(tensor<3xf32>) -> tensor<*xf32> // Test invalid tf.If operation func @testInvalidIfOp(tensor, tensor<*xf32>) -> tensor<2xf32> { ^bb0(%arg0: tensor, %arg1: tensor<*xf32>): - // expected-error @+1 {{branches inputs have incompatible types tensor<2xf32> and tensor<3xf32>}} + // expected-error @+1 {{expects all branch input type(s) (tensor<2xf32>, tensor<3xf32>) at index 0 to be cast compatible}} %1 = "tf.If"(%arg0, %arg1) { then_branch = @testIfThen, else_branch = @testIfElse, @@ -834,7 +852,7 @@ func @testIfElse(tensor<*xf32>) -> tensor<3xf32> // Test invalid tf.If operation func @testInvalidIfOp(tensor, tensor<*xf32>) -> tensor<2xf32> { ^bb0(%arg0: tensor, %arg1: tensor<*xf32>): - // expected-error @+1 {{else branch result type tensor<3xf32> is incompatible with op result type tensor<2xf32>}} + // expected-error @+1 {{expects result type 'tensor<2xf32>' to be cast compatible with 'else_branch' result type 'tensor<3xf32>' at index 0}} %1 = "tf.If"(%arg0, %arg1) { then_branch = @testIfThen, else_branch = @testIfElse, @@ -848,7 +866,7 @@ func @testInvalidIfOp(tensor, tensor<*xf32>) -> tensor<2xf32> { // Test invalid tf.Yield operation (parent should be IfRegion) func @testInvalidYieldOp(%arg0: f32) -> () { - // expected-error @+1 {{'tf.Yield' op expects parent op to be one of 'tf.IfRegion, tf.WhileRegion'}} + // expected-error @+1 {{'tf.Yield' op expects parent op to be one of 'tf.CaseRegion, tf.IfRegion, tf.WhileRegion'}} "tf.Yield"(%arg0) : (f32) -> () } @@ -895,7 +913,7 @@ func @testValidIfRegionOpWithMultipleResults(%arg0: tensor, %arg1: tensor<2x // Test invalid type for operand #0 for tf.IfRegion operation func @testInvalidIfRegionOpType0(%arg0: f32, %arg1: tensor<2xf32>) -> tensor<2xf32> { - // expected-error @+1 {{operand #0 must be tensor of tf.dtype values}} + // expected-error @+1 {{operand #0 must be 0D tensor of 1-bit signless integer values, but got 'f32'}} %0 = "tf.IfRegion"(%arg0) ({ %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> "tf.Yield"(%t) : (tensor<2xf32>) -> () @@ -2033,6 +2051,15 @@ func @testConst() -> tensor { // ----- +// Test invalid tf.ToBool +func @testInvalidToBool(%arg0: tensor) -> tensor<1xi1> { + // expected-error @+1 {{op result #0 must be 0D tensor of 1-bit signless integer values, but got 'tensor<1xi1>'}} + %0 = "tf.ToBool"(%arg0) : (tensor) -> tensor<1xi1> + return %0 : tensor<1xi1> +} + +// ----- + // Test valid tf.Transpose // CHECK-LABEL: testTranspose func @testTranspose(tensor<2x3xf32>) -> tensor<3x2xf32> { @@ -3313,3 +3340,131 @@ func @testBatchToSpaceInvalidOutputDepth(%arg0: tensor<16x8x8x3xf32>, %arg1: ten %0 = "tf.BatchToSpace"(%arg0, %arg1) {block_size = 2 : i64} : (tensor<16x8x8x3xf32>, tensor<*xi32>) -> 
tensor<4x8x8x8xf32> return } + +// ----- + +func @branch() + +func @testCaseBadBranchIndicesShape(%arg0: tensor<8xi32>) { + // expected-error @+1 {{expects 'branch_index' to be a scalar, but got 'tensor<8xi32>'}} + "tf.Case"(%arg0) {branches = [@branch], is_stateless = false} : (tensor<8xi32>) -> () + return +} + +// ----- + +func @branch0(tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> +func @branch1(tensor<2xf32>) -> tensor<2xf32> + +func @testCaseMismatchedNumOperands(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + // expected-error @+1 {{expects all branches to have 1 input(s), but branch #0 has 2 input(s)}} + %0 = "tf.Case"(%arg0, %arg1) {branches = [@branch0, @branch1], is_stateless = false} : (tensor, tensor<2xf32>) -> tensor<2xf32> + return %0 : tensor<2xf32> +} + +// ----- + +func @branch0(tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>) +func @branch1(tensor<2xf32>) -> tensor<2xf32> + +func @testCaseMismatchedNumResults(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + // expected-error @+1 {{expects all branches to have 1 result(s), but branch #0 has 2 result(s)}} + %0 = "tf.Case"(%arg0, %arg1) {branches = [@branch0, @branch1], is_stateless = false} : (tensor, tensor<2xf32>) -> tensor<2xf32> + return %0 : tensor<2xf32> +} + +// ----- + +func @branch0(tensor<*xf16>) -> tensor<*xf32> +func @branch1(tensor<*xf32>) -> tensor<*xf32> + +func @testCaseOperandNotCastCompatible(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + // expected-error @+1 {{expects operand type 'tensor<2xf32>' to be cast compatible with branch #0 input type 'tensor<*xf16>' at index 0}} + %0 = "tf.Case"(%arg0, %arg1) {branches = [@branch0, @branch1], is_stateless = false} : (tensor, tensor<2xf32>) -> tensor<2xf32> + return %0 : tensor<2xf32> +} + +// ----- + +func @branch0(tensor<2xf32>) -> tensor<*xf32> +func @branch1(tensor<3xf32>) -> tensor<*xf32> + +func @testCaseBranchArgumentsNotCastCompatible(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<2xf32> { + // expected-error @+1 {{expects all branch input type(s) (tensor<2xf32>, tensor<3xf32>) at index 0 to be cast compatible}} + %0 = "tf.Case"(%arg0, %arg1) {branches = [@branch0, @branch1], is_stateless = false} : (tensor, tensor<*xf32>) -> tensor<2xf32> + return %0 : tensor<2xf32> +} + +// ----- + +func @branch0(tensor<*xf32>) -> tensor<*xf32> +func @branch1(tensor<*xf32>) -> tensor<3xf32> + +func @testCaseResultNotCastCompatible(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<2xf32> { + // expected-error @+1 {{expects result type 'tensor<2xf32>' to be cast compatible with branch #1 result type 'tensor<3xf32>' at index 0}} + %0 = "tf.Case"(%arg0, %arg1) {branches = [@branch0, @branch1], is_stateless = false} : (tensor, tensor<*xf32>) -> tensor<2xf32> + return %0 : tensor<2xf32> +} + +// ----- + +func @testCaseRegionNoRegions(%arg0: tensor) { + // expected-error @+1 {{expects to have at least 1 region}} + "tf.CaseRegion"(%arg0) {is_stateless = false} : (tensor) -> () + return +} + +// ----- + +func @testCaseRegionBadBranchIndicesShape(%arg0: tensor<8xi32>) { + // expected-error @+1 {{expects 'branch_index' to be a scalar, but got 'tensor<8xi32>'}} + "tf.CaseRegion"(%arg0) ( { + "tf.Yield"() : () -> () + }) {is_stateless = false} : (tensor<8xi32>) -> () + return +} + +// ----- + +func @testCaseRegionMismatchedNumResults(%arg0: tensor) { + // expected-error @+1 {{region #0 should have same number (1) of results as tf.CaseRegion but has 0 results}} + %1 = "tf.CaseRegion"(%arg0) ( { + "tf.Yield"() : () -> () + }) {is_stateless = false} : 
(tensor) -> tensor + return +} + +// ----- + +func @testCaseRegionMismatchedResultTypes(%arg0: tensor, %arg1: tensor) { + // expected-error @+1 {{region #0 result type tensor is incompatible with tf.CaseRegion result type tensor at index 0}} + %1 = "tf.CaseRegion"(%arg0) ( { + "tf.Yield"(%arg1) : (tensor) -> () + }) {is_stateless = false} : (tensor) -> tensor + return +} + +// ----- + +// Test valid tf.Cumsum +func @testCumsum(%arg: tensor<8x16xf32>, %axis: tensor) -> tensor<8x16xf32> { + %0 = "tf.Cumsum"(%arg, %axis) : (tensor<8x16xf32>, tensor) -> tensor<8x16xf32> + return %0 : tensor<8x16xf32> +} + +// ----- + +func @testCumprod(%arg: tensor<8x16xf32>, %axis: tensor<2xi32>) -> tensor<8x16xf32> { + // expected-error @+1 {{requires scalar axis operand}} + %0 = "tf.Cumprod"(%arg, %axis) : (tensor<8x16xf32>, tensor<2xi32>) -> tensor<8x16xf32> + return %0 : tensor<8x16xf32> +} + +// ----- + +func @testCumprod(%arg: tensor<8x16xf32>) -> tensor<8x16xf32> { + %axis = constant dense<-3> : tensor + // expected-error @+1 {{axis operand should be within range [-2, 2)}} + %0 = "tf.Cumprod"(%arg, %axis) : (tensor<8x16xf32>, tensor) -> tensor<8x16xf32> + return %0 : tensor<8x16xf32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_device_index_selector.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_device_index_selector.mlir index 7fc2b210f91..11ceac1fe99 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_device_index_selector.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_device_index_selector.mlir @@ -9,17 +9,17 @@ func @select(%arg0: tensor, %arg1: tensor) -> (tensor, tensor tensor %1 = "tf.DeviceIndex"() {device = "", device_names = ["CPU", "GPU"]} : () -> tensor - %4 = "tf.Case"(%1, %arg0, %arg1) {branches = [@sub, @add], output_shapes = [#tf.shape<>]} : (tensor, tensor, tensor) -> tensor + %4 = "tf.Case"(%1, %arg0, %arg1) {branches = [@sub, @add], output_shapes = [#tf.shape<>], is_stateless = false} : (tensor, tensor, tensor) -> tensor return %0, %4 : tensor, tensor } -func @add(%i: tensor, %arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { +func @add(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { %0 = "tf.Add"(%arg0, %arg1): (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } -func @sub(%i: tensor, %arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { +func @sub(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { %0 = "tf.Sub"(%arg0, %arg1) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu-dynamic-layout-pass.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu-dynamic-layout-pass.mlir index 9467f890419..7b670cd831c 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu-dynamic-layout-pass.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu-dynamic-layout-pass.mlir @@ -11,9 +11,9 @@ func @non_replicated(%arg0: tensor<*x!tf.resource> {tf.device = "/device:CPU:0"} NumDynamicShapes = 0 : i64, // The metadata encodes 2 parameter and two return values. 
metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", - mlir_module = "..."} : () -> (tensor, tensor) - tf_device.return %1#0, %1#1 : tensor, tensor - }) {device = "/device:CPU:0"} : () -> (tensor, tensor) + mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) + tf_device.return %1#0, %1#1 : tensor, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) // CHECK-DAG: %[[LAYOUT0:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#1) {index = 0 : i64, is_output = false} // CHECK-DAG: %[[LAYOUT1:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#1) {index = 1 : i64, is_output = false} // CHECK: %[[ITER:.*]]:2 = "tf.IteratorGetNext" @@ -31,7 +31,7 @@ func @non_replicated(%arg0: tensor<*x!tf.resource> {tf.device = "/device:CPU:0"} // CHECK-NEXT: "tf.TPUExecute"(%[[COPY0]], %[[COPY1]], %[[COMPILE]]#1) %execute = "tf_device.launch"() ( { %3 = "tf.TPUExecute"(%2#0, %2#1, %compile#1) - : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor) -> tensor + : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor<2x!tf.string>) -> tensor tf_device.return %3 : tensor }) {device = "/device:TPU:0"} : () -> tensor return %execute : tensor @@ -49,9 +49,9 @@ func @multiple_compile_uses(%arg0: tensor<*x!tf.resource> {tf.device = "/device: NumDynamicShapes = 0 : i64, // The metadata encodes 2 parameter and two return values. metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", - mlir_module = "..."} : () -> (tensor, tensor) - tf_device.return %1#0, %1#1 : tensor, tensor - }) {device = "/device:CPU:0"} : () -> (tensor, tensor) + mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) + tf_device.return %1#0, %1#1 : tensor, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) // CHECK-NOT: "tf.TPUGetLayoutOp" // CHECK-NOT: "tf.TPUCopyWithLayout" %2:2 = "tf.IteratorGetNext"(%arg0) {device = "/device:CPU:0"} @@ -62,13 +62,13 @@ func @multiple_compile_uses(%arg0: tensor<*x!tf.resource> {tf.device = "/device: }) {device = "/device:CPU:0"} : () -> () %execute0 = "tf_device.launch"() ( { %3 = "tf.TPUExecute"(%2#0, %2#1, %compile#1) - : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor) -> tensor + : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor<2x!tf.string>) -> tensor tf_device.return %3 : tensor }) {device = "/device:TPU:0"} : () -> tensor %4:2 = "tf._UnKnownOp_"() : () -> (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>) %execute1 = "tf_device.launch"() ( { %5 = "tf.TPUExecute"(%4#0, %4#1, %compile#1) - : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor) -> tensor + : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor<2x!tf.string>) -> tensor tf_device.return %5 : tensor }) {device = "/device:TPU:0"} : () -> tensor return %execute1 : tensor @@ -85,9 +85,9 @@ func @on_tpu_iter(%arg0: tensor<*x!tf.resource> {tf.device = "/device:TPU:0"}) - NumDynamicShapes = 0 : i64, // The metadata encodes 2 parameter and two return values. 
metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", - mlir_module = "..."} : () -> (tensor, tensor) - tf_device.return %1#0, %1#1 : tensor, tensor - }) {device = "/device:CPU:0"} : () -> (tensor, tensor) + mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) + tf_device.return %1#0, %1#1 : tensor, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) // CHECK-NOT: "tf.TPUGetLayoutOp" // CHECK-NOT: "tf.TPUCopyWithLayout" %2:2 = "tf.IteratorGetNext"(%arg0) {device = "/device:TPU:0"} @@ -98,7 +98,7 @@ func @on_tpu_iter(%arg0: tensor<*x!tf.resource> {tf.device = "/device:TPU:0"}) - }) {device = "/device:CPU:0"} : () -> () %execute = "tf_device.launch"() ( { %3 = "tf.TPUExecute"(%2#0, %2#1, %compile#1) - : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor) -> tensor + : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor<2x!tf.string>) -> tensor tf_device.return %3 : tensor }) {device = "/device:TPU:0"} : () -> tensor return %execute : tensor @@ -116,9 +116,9 @@ func @arg_on_tpu_iter_on_cpu(%arg0: tensor<*x!tf.resource> {tf.device = "/device NumDynamicShapes = 0 : i64, // The metadata encodes 2 parameter and two return values. metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", - mlir_module = "..."} : () -> (tensor, tensor) - tf_device.return %1#0, %1#1 : tensor, tensor - }) {device = "/device:CPU:0"} : () -> (tensor, tensor) + mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) + tf_device.return %1#0, %1#1 : tensor, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) // CHECK-NOT: "tf.TPUGetLayoutOp" // CHECK-NOT: "tf.TPUCopyWithLayout" %2:2 = "tf.IteratorGetNext"(%arg0) {device = "/device:CPU:0"} @@ -129,7 +129,7 @@ func @arg_on_tpu_iter_on_cpu(%arg0: tensor<*x!tf.resource> {tf.device = "/device }) {device = "/device:CPU:0"} : () -> () %execute = "tf_device.launch"() ( { %3 = "tf.TPUExecute"(%2#0, %2#1, %compile#1) - : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor) -> tensor + : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor<2x!tf.string>) -> tensor tf_device.return %3 : tensor }) {device = "/device:TPU:0"} : () -> tensor return %execute : tensor @@ -148,9 +148,9 @@ func @arg_on_tpu_intermediate_ops_on_cpu(%arg0: tensor<*x!tf.resource> {tf.devic NumDynamicShapes = 0 : i64, // The metadata encodes 2 parameter and two return values. 
metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", - mlir_module = "..."} : () -> (tensor, tensor) - tf_device.return %1#0, %1#1 : tensor, tensor - }) {device = "/device:CPU:0"} : () -> (tensor, tensor) + mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) + tf_device.return %1#0, %1#1 : tensor, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) %id1 = "tf.Identity"(%arg0) {device = "/device:CPU:0"} : (tensor<*x!tf.resource>) -> (tensor<*x!tf.resource>) %id2 = "tf.Identity"(%id1) {device = "/device:CPU:0"} : (tensor<*x!tf.resource>) -> (tensor<*x!tf.resource>) // CHECK-NOT: "tf.TPUGetLayoutOp" @@ -163,7 +163,7 @@ func @arg_on_tpu_intermediate_ops_on_cpu(%arg0: tensor<*x!tf.resource> {tf.devic }) {device = "/device:CPU:0"} : () -> () %execute = "tf_device.launch"() ( { %3 = "tf.TPUExecute"(%2#0, %2#1, %compile#1) - : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor) -> tensor + : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor<2x!tf.string>) -> tensor tf_device.return %3 : tensor }) {device = "/device:TPU:0"} : () -> tensor return %execute : tensor @@ -181,9 +181,9 @@ func @var_handle_on_tpu_iter_on_cpu() -> tensor { NumDynamicShapes = 0 : i64, // The metadata encodes 2 parameter and two return values. metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", - mlir_module = "..."} : () -> (tensor, tensor) - tf_device.return %1#0, %1#1 : tensor, tensor - }) {device = "/device:CPU:0"} : () -> (tensor, tensor) + mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) + tf_device.return %1#0, %1#1 : tensor, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) %var = "tf.VarHandleOp"() {container = "c", shared_name = "v", device = "/device:TPU:0"} : () -> tensor<*x!tf.resource> // CHECK-NOT: "tf.TPUGetLayoutOp" // CHECK-NOT: "tf.TPUCopyWithLayout" @@ -195,7 +195,7 @@ func @var_handle_on_tpu_iter_on_cpu() -> tensor { }) {device = "/device:CPU:0"} : () -> () %execute = "tf_device.launch"() ( { %3 = "tf.TPUExecute"(%2#0, %2#1, %compile#1) - : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor) -> tensor + : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor<2x!tf.string>) -> tensor tf_device.return %3 : tensor }) {device = "/device:TPU:0"} : () -> tensor return %execute : tensor @@ -212,9 +212,9 @@ func @unsupported_ops(%arg0: tensor<3x3x1x32xf32> {tf.device = "/device:CPU:0"}) NumDynamicShapes = 0 : i64, // The metadata encodes 2 parameter and two return values. 
metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", - mlir_module = "..."} : () -> (tensor, tensor) - tf_device.return %1#0, %1#1 : tensor, tensor - }) {device = "/device:CPU:0"} : () -> (tensor, tensor) + mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) + tf_device.return %1#0, %1#1 : tensor, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) // CHECK-NOT: "tf.TPUGetLayoutOp" // CHECK-NOT: "tf.TPUCopyWithLayout" %2 = "tf._Unknown_"() : () -> tensor<3x3x1x32xf32> @@ -224,7 +224,7 @@ func @unsupported_ops(%arg0: tensor<3x3x1x32xf32> {tf.device = "/device:CPU:0"}) }) {device = "/device:CPU:0"} : () -> () %execute = "tf_device.launch"() ( { %3 = "tf.TPUExecute"(%arg0, %2, %compile#1) - : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor) -> tensor + : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor<2x!tf.string>) -> tensor tf_device.return %3 : tensor }) {device = "/device:TPU:0"} : () -> tensor return %execute : tensor @@ -246,9 +246,9 @@ func @replicated(%arg0: tensor<*x!tf.resource> {tf.device = "/device:CPU:0"}) -> NumDynamicShapes = 0 : i64, // The metadata encodes 2 parameter and two return values. metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", - mlir_module = "..."} : () -> (tensor, tensor) - tf_device.return %1#0, %1#1 : tensor, tensor - }) {device = "/device:CPU:0"} : () -> (tensor, tensor) + mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) + tf_device.return %1#0, %1#1 : tensor, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) // CHECK-DAG: %[[LAYOUT0:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#1) {index = 0 : i64, is_output = false} // CHECK-DAG: %[[LAYOUT1:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#1) {index = 1 : i64, is_output = false} // CHECK: %[[ITER1:.*]]:2 = "tf.IteratorGetNext" @@ -267,7 +267,7 @@ func @replicated(%arg0: tensor<*x!tf.resource> {tf.device = "/device:CPU:0"}) -> {n = 2 : i32, devices = {TPU_REPLICATED_CORE_0 = ["/device:TPU:0", "/device:TPU:1"]}} { // CHECK: "tf.TPUExecute"(%[[R0]], %[[R1]], %[[COMPILE]]#1) %execute = "tf_device.launch"() ( { - %4 = "tf.TPUExecute"(%r0, %r1, %compile#1) : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor) -> tensor + %4 = "tf.TPUExecute"(%r0, %r1, %compile#1) : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor<2x!tf.string>) -> tensor tf_device.return %4 : tensor }) {device = "TPU_REPLICATED_CORE_0"} : () -> tensor tf_device.return %execute : tensor @@ -286,9 +286,9 @@ func @inside_replicated(%arg0: tensor<*x!tf.resource> {tf.device = "/device:CPU: NumDynamicShapes = 0 : i64, // The metadata encodes 2 parameter and two return values. 
metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", - mlir_module = "..."} : () -> (tensor, tensor) - tf_device.return %1#0, %1#1 : tensor, tensor - }) {device = "/device:CPU:0"} : () -> (tensor, tensor) + mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) + tf_device.return %1#0, %1#1 : tensor, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) // CHECK-NOT: "tf.TPUGetLayoutOp" // CHECK-NOT: "tf.TPUCopyWithLayout" "tf_device.launch"() ( { @@ -300,7 +300,7 @@ func @inside_replicated(%arg0: tensor<*x!tf.resource> {tf.device = "/device:CPU: %2:2 = "tf.IteratorGetNext"(%r0) : (tensor<*x!tf.resource>) -> (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>) %execute = "tf_device.launch"() ( { - %4 = "tf.TPUExecute"(%2#0, %2#1, %compile#1) : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor) -> tensor + %4 = "tf.TPUExecute"(%2#0, %2#1, %compile#1) : (tensor<3x3x1x32xf32>, tensor<3x3x1x32xf32>, tensor<2x!tf.string>) -> tensor tf_device.return %4 : tensor }) {device = "TPU_REPLICATED_CORE_0"} : () -> tensor tf_device.return %execute : tensor @@ -330,9 +330,9 @@ func @parallel_execute(%arg0: tensor<*x!tf.resource> {tf.device = "/device:CPU:0 // CHECK: %[[COMPILE:.*]]:3 = "tf_device.launch" // CHECK-NEXT: "tf._TPUCompileMlir"() %compile:3 = "tf_device.launch"() ( { - %1:3 = "tf._TPUCompileMlir"() {NumDynamicShapes = 0 : i64, metadata = "\0A\09\08\01\12\05\12\03\08\80\01\18\01 \02", mlir_module = "..."} : () -> (tensor, tensor, tensor) - tf_device.return %1#0, %1#1, %1#2 : tensor, tensor, tensor - }) {device = "/device:CPU:0"} : () -> (tensor, tensor, tensor) + %1:3 = "tf._TPUCompileMlir"() {NumDynamicShapes = 0 : i64, metadata = "\0A\09\08\01\12\05\12\03\08\80\01\18\01 \02", mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>, tensor<2x!tf.string>) + tf_device.return %1#0, %1#1, %1#2 : tensor, tensor<2x!tf.string>, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>, tensor<2x!tf.string>) // CHECK-DAG: %[[LAYOUT0:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#1) {index = 0 : i64, is_output = false} // CHECK-DAG: %[[LAYOUT1:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#2) {index = 0 : i64, is_output = false} // CHECK: %[[ITER:.*]]:2 = "tf.IteratorGetNext" @@ -351,7 +351,7 @@ func @parallel_execute(%arg0: tensor<*x!tf.resource> {tf.device = "/device:CPU:0 // CHECK-NEXT: tf_device.return // CHECK-NEXT: device = "/device:TPU:0" "tf_device.launch"() ( { - "tf.TPUExecute"(%2#0, %compile#1) : (tensor<128xf32>, tensor) -> () + "tf.TPUExecute"(%2#0, %compile#1) : (tensor<128xf32>, tensor<2x!tf.string>) -> () tf_device.return }) {device = "/device:TPU:0"} : () -> () tf_device.return @@ -364,7 +364,7 @@ func @parallel_execute(%arg0: tensor<*x!tf.resource> {tf.device = "/device:CPU:0 // CHECK-NEXT: tf_device.return // CHECK-NEXT: device = "/device:TPU:1" "tf_device.launch"() ( { - "tf.TPUExecute"(%2#1, %compile#2) : (tensor<128xf32>, tensor) -> () + "tf.TPUExecute"(%2#1, %compile#2) : (tensor<128xf32>, tensor<2x!tf.string>) -> () tf_device.return }) {device = "/device:TPU:1"} : () -> () tf_device.return @@ -396,9 +396,9 @@ func @replicated_parallel_execute(%arg0: tensor<*x!tf.resource> {tf.device = "/d // CHECK: %[[COMPILE:.*]]:3 = "tf_device.launch" // CHECK-NEXT: "tf._TPUCompileMlir"() %compile:3 = "tf_device.launch"() ( { - %1:3 = 
"tf._TPUCompileMlir"() {NumDynamicShapes = 0 : i64, metadata = "\0A\09\08\01\12\05\12\03\08\80\01\18\02 \02", mlir_module = "..."} : () -> (tensor, tensor, tensor) - tf_device.return %1#0, %1#1, %1#2 : tensor, tensor, tensor - }) {device = "/device:CPU:0"} : () -> (tensor, tensor, tensor) + %1:3 = "tf._TPUCompileMlir"() {NumDynamicShapes = 0 : i64, metadata = "\0A\09\08\01\12\05\12\03\08\80\01\18\02 \02", mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>, tensor<2x!tf.string>) + tf_device.return %1#0, %1#1, %1#2 : tensor, tensor<2x!tf.string>, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>, tensor<2x!tf.string>) // CHECK-DAG: %[[LAYOUT0:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#1) {index = 0 : i64, is_output = false} // CHECK-DAG: %[[LAYOUT1:.*]] = "tf.TPUGetLayoutOp"(%[[COMPILE]]#2) {index = 0 : i64, is_output = false} // CHECK-DAG: %[[ITER0:.*]]:2 = "tf.IteratorGetNext"(%[[ARG0]]) @@ -423,7 +423,7 @@ func @replicated_parallel_execute(%arg0: tensor<*x!tf.resource> {tf.device = "/d // CHECK-NEXT: tf_device.return // CHECK-NEXT: device = "TPU_REPLICATED_CORE_0" "tf_device.launch"() ( { - "tf.TPUExecute"(%r0, %compile#1) : (tensor<128xf32>, tensor) -> () + "tf.TPUExecute"(%r0, %compile#1) : (tensor<128xf32>, tensor<2x!tf.string>) -> () tf_device.return }) {device = "TPU_REPLICATED_CORE_0"} : () -> () tf_device.return @@ -433,7 +433,7 @@ func @replicated_parallel_execute(%arg0: tensor<*x!tf.resource> {tf.device = "/d // CHECK-NEXT: tf_device.return // CHECK-NEXT: device = "TPU_REPLICATED_CORE_1" "tf_device.launch"() ( { - "tf.TPUExecute"(%r1, %compile#2) : (tensor<128xf32>, tensor) -> () + "tf.TPUExecute"(%r1, %compile#2) : (tensor<128xf32>, tensor<2x!tf.string>) -> () tf_device.return }) {device = "TPU_REPLICATED_CORE_1"} : () -> () tf_device.return diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu-variable-runtime-reformatting.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu-variable-runtime-reformatting.mlir index 1e308b42bfc..277e4a8415e 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu-variable-runtime-reformatting.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu-variable-runtime-reformatting.mlir @@ -61,9 +61,9 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr NumDynamicShapes = 0 : i64, // The metadata encodes 2 parameter and two return values. 
metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", - mlir_module = "..."} : () -> (tensor, tensor) - tf_device.return %2#0, %2#1 : tensor, tensor - }) {device = "/device:CPU:0"} : () -> (tensor, tensor) + mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) + tf_device.return %2#0, %2#1 : tensor, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) "tf_device.launch"() ( { "tf.TPUCompileSucceededAssert"(%compile#0) : (tensor) -> () tf_device.return @@ -86,7 +86,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr "tf_device.launch"() ( { "tf.TPUExecuteAndUpdateVariables"(%id, %arg31, %compile#1) {device_var_reads_indices = [0, 1], device_var_updates_indices = [0, 1]} - : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor) -> () + : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor<2x!tf.string>) -> () tf_device.return }) {device = "TPU_REPLICATED_CORE_0"} : () -> () %ret = "tf.Const"() {value = dense<0> : tensor} : () -> tensor @@ -153,9 +153,9 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr NumDynamicShapes = 0 : i64, // The metadata encodes 2 parameter and two return values. metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", - mlir_module = "..."} : () -> (tensor, tensor) - tf_device.return %2#0, %2#1 : tensor, tensor - }) {device = "/device:CPU:0"} : () -> (tensor, tensor) + mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) + tf_device.return %2#0, %2#1 : tensor, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) "tf_device.launch"() ( { "tf.TPUCompileSucceededAssert"(%compile#0) : (tensor) -> () tf_device.return @@ -173,7 +173,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr "tf.TPUExecuteAndUpdateVariables"(%arg30, %arg31, %arg32, %compile#1) {device_var_reads_indices = [0, 1], device_var_updates_indices = [0, 1]} : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>>, tensor) -> () + tensor<*x!tf.resource>>, tensor<2x!tf.string>) -> () tf_device.return }) {device = "TPU_REPLICATED_CORE_0"} : () -> () tf_device.return @@ -239,9 +239,9 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr NumDynamicShapes = 0 : i64, // The metadata encodes 2 parameter and two return values. 
metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", - mlir_module = "..."} : () -> (tensor, tensor) - tf_device.return %2#0, %2#1 : tensor, tensor - }) {device = "/device:CPU:0"} : () -> (tensor, tensor) + mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) + tf_device.return %2#0, %2#1 : tensor, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) "tf_device.launch"() ( { "tf.TPUCompileSucceededAssert"(%compile#0) : (tensor) -> () tf_device.return @@ -254,7 +254,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr "tf_device.launch"() ( { "tf.TPUExecuteAndUpdateVariables"(%id, %arg31, %compile#1) {device_var_reads_indices = [0, 1], device_var_updates_indices = [0, 1]} - : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor) -> () + : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor<2x!tf.string>) -> () tf_device.return }) {device = "TPU_REPLICATED_CORE_0"} : () -> () tf_device.return @@ -342,9 +342,9 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr NumDynamicShapes = 0 : i64, // The metadata encodes 2 parameter and two return values. metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", - mlir_module = "..."} : () -> (tensor, tensor) - tf_device.return %2#0, %2#1 : tensor, tensor - }) {device = "/device:CPU:0"} : () -> (tensor, tensor) + mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) + tf_device.return %2#0, %2#1 : tensor, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) "tf_device.launch"() ( { "tf.TPUCompileSucceededAssert"(%compile#0) : (tensor) -> () tf_device.return @@ -367,7 +367,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr "tf_device.launch"() ( { "tf.TPUExecuteAndUpdateVariables"(%id, %arg31, %compile#1) {device_var_reads_indices = [0, 1], device_var_updates_indices = [0, 1]} - : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor) -> () + : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor<2x!tf.string>) -> () tf_device.return }) {device = "TPU_REPLICATED_CORE_0"} : () -> () %ret = "tf.Const"() {value = dense<0> : tensor} : () -> tensor diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir index 1f516a25824..2271bca7382 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir @@ -512,6 +512,137 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor return %1 : tensor } + // Tests extraction of an outside compiled tf.IfRegion op where the entirety + // of tf.IfRegion op is outside compiled + + // CHECK-LABEL: func @outside_compiled_tf_if + func @outside_compiled_tf_if(%arg0: tensor) -> tensor { + // CHECK: %[[A_OUT:[0-9]*]] = "tf.A" + // CHECK: %[[F_OUT:[0-9]*]] = "tf.F" + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + 
// CHECK-NEXT: "tf_device.launch" + // CHECK-NEXT: %[[PLACEHOLDER_KEY:[0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey"() + // CHECK-NEXT: %[[RECV_OUTPUT:[0-9]*]]:3 = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "host_compute_channel_cluster1_args" + // CHECK-SAME: (tensor<2x!tf.string>) -> (tensor, tensor, tensor) + // CHECK-NEXT: tf.IfRegion"(%[[RECV_OUTPUT]]#2) + // CHECK: "tf.D"(%[[RECV_OUTPUT]]#0, %[[RECV_OUTPUT]]#1, %[[F_OUT]]) + // CHECK: "tf._XlaSendFromHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "host_compute_channel_cluster1_retvals" + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" + // CHECK: %[[G_OUTPUT:[0-9]*]] = "tf.G" + // CHECK: "tf._XlaHostComputeMlir"(%[[B_OUTPUT]], %[[A_OUTPUT]], %[[G_OUTPUT]]) + // CHECK-SAME: recv_key = "host_compute_channel_cluster1_retvals" + // CHECK-SAME: send_key = "host_compute_channel_cluster1_args" + // CHECK-SAME: tpu_core = 0 + %0 = "tf.A"(%arg0) : (tensor) -> tensor + %7 = "tf.F"() : () -> tensor + + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"() : () -> (tensor) + %6 = "tf.G"() : () -> (tensor) + + "tf.IfRegion"(%6) ({ + "tf.D"(%4, %3, %7) {} : (tensor, tensor, tensor) -> () + "tf.Yield"() : () -> () + }, { + "tf.Yield"() : () -> () + }) {_xla_outside_compilation = "cluster1", is_stateless = false} : (tensor) -> () + + %5 = "tf.E"() : () -> tensor + tf_device.return %5 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor + } + + // Tests extraction of an outside compiled tf.IfRegion op where the entirety + // of tf.IfRegion op is outside compiled and wrapped inside another + // tf.IfRegion op + + // CHECK-LABEL: func @outside_compiled_tf_if_nested + func @outside_compiled_tf_if_nested(%arg0: tensor) -> tensor { + // CHECK: %[[A_OUT:[0-9]*]] = "tf.A" + // CHECK: %[[F_OUT:[0-9]*]] = "tf.F" + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK-NEXT: %[[PLACEHOLDER_KEY:[0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey"() + // CHECK-NEXT: %[[RECV_OUTPUT_PREDICATE:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "if_predicate_channel_cluster1_0" + // CHECK-SAME: (tensor<2x!tf.string>) -> tensor + // CHECK-NEXT: tf.IfRegion"(%[[RECV_OUTPUT_PREDICATE]]) + // CHECK-NEXT: %[[RECV_OUTPUT:[0-9]*]]:2 = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "host_compute_channel_cluster1_args" + // CHECK-SAME: (tensor<2x!tf.string>) -> (tensor, tensor) + // CHECK-NEXT: tf.IfRegion"(%[[RECV_OUTPUT]]#1) + // CHECK-NEXT: "tf.H"(%[[RECV_OUTPUT]]#0, %[[F_OUT]]) + // CHECK: "tf.Yield"() : () -> () + // CHECK: "tf.Yield"() : () -> () + // CHECK: "tf._XlaSendFromHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "host_compute_channel_cluster1_retvals" + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" + // CHECK: %[[G_OUTPUT:[0-9]*]] = "tf.G" + // CHECK: "tf.XlaSendToHost"(%[[G_OUTPUT]]) + // CHECK-SAME: key = "if_predicate_channel_cluster1_0" + // 
CHECK-SAME: (tensor) -> () + // CHECK-NEXT: "tf.IfRegion"(%[[G_OUTPUT]]) + // CHECK: %[[D_OUT:[0-9]*]] = "tf.D" + // CHECK-NEXT: %[[F_OUT:[0-9]*]] = "tf.F" + // CHECK: "tf._XlaHostComputeMlir"(%[[D_OUT]], %[[F_OUT]]) + // CHECK-SAME: recv_key = "host_compute_channel_cluster1_retvals" + // CHECK-SAME: send_key = "host_compute_channel_cluster1_args" + // CHECK-SAME: tpu_core = 0 + // CHECK: "tf.Yield"() : () -> () + // CHECK: "tf.Yield"() : () -> () + %0 = "tf.A"(%arg0) : (tensor) -> tensor + %7 = "tf.F"() : () -> tensor + + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"() : () -> (tensor) + %6 = "tf.G"() : () -> (tensor) + + "tf.IfRegion"(%6) ({ + %8 = "tf.D"(%4, %3, %7) {} : (tensor, tensor, tensor) -> (tensor) + %9 = "tf.F"(%4) {} : (tensor) -> (tensor) + + "tf.IfRegion"(%9) ({ + "tf.H"(%8, %7) : (tensor, tensor) -> () + "tf.Yield"() : () -> () + }, { + "tf.Yield"() : () -> () + }) {_xla_outside_compilation = "cluster1", is_stateless = false} : (tensor) -> () + + "tf.Yield"() : () -> () + }, { + "tf.Yield"() : () -> () + }) {is_stateless = false} : (tensor) -> () + + %5 = "tf.E"() : () -> tensor + tf_device.return %5 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor + } + // Tests extraction of a single outside compiled cluster inside a tf.IfRegion // op with return values. diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_identity_pruning.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_identity_pruning.mlir new file mode 100644 index 00000000000..317e7036c42 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_identity_pruning.mlir @@ -0,0 +1,93 @@ +// RUN: tf-opt %s -tf-tpu-identity-pruning | FileCheck %s --dump-input=always + +// Tests Identity op in cluster is pruned away. + +// CHECK-LABEL: func @testIdentity +// CHECK-SAME: ([[ARG0:%.*]]: tensor) +func @testIdentity(%arg0: tensor) { + // CHECK-NOT: "tf.Identity" + // CHECK: "tf_device.cluster" + // CHECK-NEXT: tf_device.return [[ARG0]] + %0 = "tf_device.cluster"() ( { + %1 = "tf.Identity"(%arg0) : (tensor) -> tensor + tf_device.return %1 : tensor + }) : () -> tensor + return +} + +// Tests IdentityN op in cluster is pruned away. + +// CHECK-LABEL: func @testIdentityN +// CHECK-SAME: ([[ARG0:%.*]]: tensor, [[ARG1:%.*]]: tensor) +func @testIdentityN(%arg0: tensor, %arg1: tensor) { + // CHECK-NOT: "tf.IdentityN" + // CHECK: "tf_device.cluster" + // CHECK-NEXT: tf_device.return [[ARG0]], [[ARG1]] + %0:2 = "tf_device.cluster"() ( { + %1:2 = "tf.IdentityN"(%arg0, %arg1) : (tensor, tensor) -> (tensor, tensor) + tf_device.return %1#0, %1#1 : tensor, tensor + }) : () -> (tensor, tensor) + return +} + +// Tests transitive Identity ops reachable from the cluster are pruned away. 
+ +// CHECK-LABEL: func @testTransitiveIdentity +// CHECK-SAME: ([[ARG0:%.*]]: tensor) +func @testTransitiveIdentity(%arg0: tensor) { + // CHECK: "tf_device.cluster" + // CHECK: "tf.PartitionedCall"([[ARG0]]) + // CHECK-SAME: f = @callee0 + %0 = "tf_device.cluster"() ( { + %1 = "tf.PartitionedCall"(%arg0) {config = "", config_proto = "", executor_type = "", f = @callee0} : (tensor) -> tensor + tf_device.return %1 : tensor + }) : () -> tensor + return +} + +// CHECK-LABEL: func @callee0 +// CHECK-SAME: ([[ARG0:%.*]]: tensor) +func @callee0(%arg0: tensor) -> tensor { + // CHECK-NOT: "tf.Identity" + // CHECK: "tf.PartitionedCall"([[ARG0]]) + // CHECK-SAME: f = @callee1 + %0 = "tf.Identity"(%arg0) : (tensor) -> tensor + %1 = "tf.PartitionedCall"(%arg0) {config = "", config_proto = "", executor_type = "", f = @callee1} : (tensor) -> tensor + return %1 : tensor +} + +// CHECK-LABEL: func @callee1 +// CHECK-SAME: ([[ARG0:%.*]]: tensor) +func @callee1(%arg0: tensor) -> tensor { + // CHECK-NOT: "tf.Identity" + // CHECK: return [[ARG0]] + %0 = "tf.Identity"(%arg0) : (tensor) -> tensor + return %0 : tensor +} + +// Tests Identity ops not reachable from the cluster are not pruned away. + +// CHECK-LABEL: func @testIdentityOutsideCluster +// CHECK-SAME: ([[ARG0:%.*]]: tensor) +func @testIdentityOutsideCluster(%arg0: tensor) { + // CHECK: [[IDENTITY:%.*]] = "tf.Identity"([[ARG0]]) + // CHECK: [[CLUSTER:%.*]] = "tf_device.cluster" + // CHECK-NEXT: tf_device.return [[IDENTITY]] + %0 = "tf.Identity"(%arg0) : (tensor) -> tensor + %1 = "tf_device.cluster"() ( { + tf_device.return %0 : tensor + }) : () -> tensor + // CHECK: "tf.PartitionedCall"([[CLUSTER]]) + // CHECK-SAME: f = @callee2 + %2 = "tf.PartitionedCall"(%1) {config = "", config_proto = "", executor_type = "", f = @callee2} : (tensor) -> tensor + return +} + +// CHECK-LABEL: func @callee2 +// CHECK-SAME: ([[ARG0:%.*]]: tensor) +func @callee2(%arg0: tensor) -> tensor { + // CHECK: [[IDENTITY:%.*]] = "tf.Identity"([[ARG0]]) + %0 = "tf.Identity"(%arg0) : (tensor) -> tensor + // CHECK: return [[IDENTITY]] + return %0 : tensor +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir index 2a0091ce9bf..ef7b52cd978 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir @@ -1262,15 +1262,15 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK-NOT:"tf._TPUCompileMlirPlaceholderProgramKey" // CHECK: "tf.E"(%[[COMPILE_OUTPUT]]#1 %3 = "tf_device.parallel_execute"() ( { - %program = "tf._TPUCompileMlirPlaceholderProgramKey"() : () -> tensor - "tf.D"(%program) : (tensor) -> () + %program = "tf._TPUCompileMlirPlaceholderProgramKey"() : () -> tensor<2x!tf.string> + "tf.D"(%program) : (tensor<2x!tf.string>) -> () tf_device.return }, { %4 = "tf_device.cluster_func"(%ri_0) {_tpu_replicate = "cluster0", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], use_spmd_for_xla_partitioning = false} : (tensor) -> tensor tf_device.return %4 : tensor }, { - %program = "tf._TPUCompileMlirPlaceholderProgramKey"() : () -> tensor - "tf.E"(%program) : (tensor) -> () + %program = "tf._TPUCompileMlirPlaceholderProgramKey"() : () -> 
tensor<2x!tf.string> + "tf.E"(%program) : (tensor<2x!tf.string>) -> () tf_device.return }) : () -> (tensor) tf_device.return %3 : tensor diff --git a/tensorflow/compiler/mlir/tensorflow/tests/visitor-interrupt-util.mlir b/tensorflow/compiler/mlir/tensorflow/tests/visitor-interrupt-util.mlir index 1770b4e146d..8cc8d273bec 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/visitor-interrupt-util.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/visitor-interrupt-util.mlir @@ -69,7 +69,7 @@ func @foo(%arg0: tensor) -> tensor { // Test static filtering // expected-remark@below {{0: before all regions}} // expected-remark@below {{7: walk was interrupted}} -func @foo(%arg0: tensor) -> tensor { +func @foo(%arg0: tensor, %arg1: tensor) -> tensor { // expected-remark@below {{1: before all regions}} %cst = constant dense<1.0> : tensor // expected-remark@below {{2: before all regions}} @@ -77,7 +77,7 @@ func @foo(%arg0: tensor) -> tensor { // expected-remark@below {{8: before all regions}} // expected-remark@below {{9: before region #1}} // expected-remark@below {{10: after all regions}} - %0 = "tf.IfRegion"(%arg0) ({ + %0 = "tf.IfRegion"(%arg1) ({ // expected-remark@below {{3: before all regions}} %1 = "tf.Identity"(%arg0) : (tensor) -> tensor // expected-remark@below {{4: before all regions}} @@ -86,6 +86,6 @@ func @foo(%arg0: tensor) -> tensor { // expected-remark@below {{6: before all regions}} %1 = "tf.Identity"(%arg0) : (tensor) -> tensor "tf.Yield"(%1) { interrupt_after_all = true } : (tensor) -> () - }) {is_stateless = true}: (tensor) -> tensor + }) {is_stateless = true}: (tensor) -> tensor return %0 : tensor } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/visitor-util.mlir b/tensorflow/compiler/mlir/tensorflow/tests/visitor-util.mlir index d376fad5c33..9a832b7fe8d 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/visitor-util.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/visitor-util.mlir @@ -77,7 +77,7 @@ func @foo(%arg0: tensor) -> tensor { // Test static filtering // expected-remark@below {{0: before all regions}} // expected-remark@below {{10: after all regions}} -func @foo(%arg0: tensor) -> tensor { +func @foo(%arg0: tensor, %arg1: tensor) -> tensor { // expected-remark@below {{1: before all regions}} %cst = constant dense<1.0> : tensor // expected-remark@below {{2: before all regions}} @@ -86,7 +86,7 @@ func @foo(%arg0: tensor) -> tensor { // expected-remark@below {{11: before all regions}} // expected-remark@below {{12: before region #1}} // expected-remark@below {{13: after all regions}} - %0 = "tf.IfRegion"(%arg0) ({ + %0 = "tf.IfRegion"(%arg1) ({ // expected-remark@below {{3: before all regions}} %1 = "tf.Identity"(%arg0) : (tensor) -> tensor // expected-remark@below {{4: before all regions}} @@ -96,7 +96,7 @@ func @foo(%arg0: tensor) -> tensor { %1 = "tf.Identity"(%arg0) : (tensor) -> tensor // expected-remark@below {{7: before all regions}} "tf.Yield"(%1) : (tensor) -> () - }) {is_stateless = true}: (tensor) -> tensor + }) {is_stateless = true}: (tensor) -> tensor // expected-remark@below {{9: before all regions}} return %0 : tensor } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc index 5b0a4b4e619..0c21078b0ad 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc @@ -95,16 +95,18 @@ void CreateTPUBridgePipeline(OpPassManager &pm) { func_pm.addPass(CreateTPUHostComputationExpansionPass()); 
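// Editor's note (summary of the reordering in this hunk, not part of the patch):
// after this change the region-based portion of CreateTPUBridgePipeline runs in
// the following order; the two passes marked "added" are newly inserted here:
//
//   pm.addPass(TF::CreateTFShapeInferencePass());
//   pm.addPass(TFDevice::CreateResourceOpLiftingPass());
//   pm.addPass(TF::CreateTFFunctionalControlFlowToRegions());
//   pm.addPass(mlir::createInlinerPass());
//   pm.addPass(TFDevice::CreateMarkOpsForOutsideCompilationPass());   // added
//   pm.addPass(CreateTPUExtractHeadTailOutsideCompilationPass());
//   pm.addPass(CreateTPUExtractOutsideCompilationPass());             // added
//   pm.addPass(TF::CreateTFRegionControlFlowToFunctional());
//   pm.addNestedPass<FuncOp>(tf_executor::CreateTFExecutorConstantSinkingPass());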
func_pm.addPass(CreateTPUUpdateEmbeddingEnqueueOpInputsPass()); } - pm.addPass(TF::CreateTFFunctionalControlFlowToRegions()); - pm.addPass(mlir::createInlinerPass()); - pm.addPass(CreateTPUExtractHeadTailOutsideCompilationPass()); - pm.addPass(TF::CreateTFRegionControlFlowToFunctional()); - // Run another shape inference pass because resource decomposition might have // created new partial types. pm.addPass(TF::CreateTFShapeInferencePass()); - pm.addNestedPass(tf_executor::CreateTFExecutorConstantSinkingPass()); pm.addPass(TFDevice::CreateResourceOpLiftingPass()); + pm.addPass(TF::CreateTFFunctionalControlFlowToRegions()); + pm.addPass(mlir::createInlinerPass()); + pm.addPass(TFDevice::CreateMarkOpsForOutsideCompilationPass()); + pm.addPass(CreateTPUExtractHeadTailOutsideCompilationPass()); + pm.addPass(CreateTPUExtractOutsideCompilationPass()); + pm.addPass(TF::CreateTFRegionControlFlowToFunctional()); + + pm.addNestedPass(tf_executor::CreateTFExecutorConstantSinkingPass()); pm.addPass(TF::CreateResourceDeviceInferencePass()); pm.addPass(TFDevice::CreateClusterOutliningPass()); pm.addPass(CreateTPUDynamicPaddingMapperPass()); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc index 1429e2b3fd4..3005c78c54f 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc @@ -17,6 +17,7 @@ limitations under the License. #include +#include "mlir/IR/OpDefinition.h" // from @llvm-project #include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/c/eager/c_api.h" @@ -68,7 +69,7 @@ static bool ShouldBeFolded(Operation* inst) { LogicalResult ConstantFoldFallbackHook( Operation* inst, ArrayRef operands, - SmallVectorImpl& results) { // NOLINT + SmallVectorImpl& results) { // NOLINT // Instructions with side effects should not be constant folded to preserve // the original semantics. if (inst->getNumRegions() != 0 || !MemoryEffectOpInterface::hasNoEffect(inst)) @@ -126,8 +127,16 @@ LogicalResult ConstantFoldFallbackHook( // TODO(jpienaar): Avoid using global context & mutex here. 
static auto* mu = new tensorflow::mutex(); tensorflow::mutex_lock l(*mu); - return tensorflow::EvaluateOperation(inst, inputs, ctx, &results); + SmallVector constants; + LogicalResult status = + tensorflow::EvaluateOperation(inst, inputs, ctx, &constants); + results.assign(constants.begin(), constants.end()); + return status; } +static bool init_hooks = ([] () { + TensorFlowDialect::RegisterConstantFoldHook(ConstantFoldFallbackHook); +}(), true); + } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.h b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.h index 69e39080965..887eea745e7 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.h @@ -27,7 +27,7 @@ namespace TF { LogicalResult ConstantFoldFallbackHook( Operation *inst, ArrayRef operands, - SmallVectorImpl &results); // NOLINT + SmallVectorImpl &results); // NOLINT } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/dialect_hooks.cc b/tensorflow/compiler/mlir/tensorflow/transforms/decode_attributes_hook.cc similarity index 74% rename from tensorflow/compiler/mlir/tensorflow/transforms/dialect_hooks.cc rename to tensorflow/compiler/mlir/tensorflow/transforms/decode_attributes_hook.cc index 109ceea47e7..d309c6d379f 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/dialect_hooks.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/decode_attributes_hook.cc @@ -19,7 +19,6 @@ limitations under the License. #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Dialect.h" // from @llvm-project -#include "mlir/IR/DialectHooks.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project @@ -35,31 +34,22 @@ namespace { // Since this method is passed to MLIR as decode hook it has to conform // to LLVM style used by MLIR. -bool DecodeOpaqueTensorHook(const OpaqueElementsAttr input, - ElementsAttr& output) { // NOLINT +LogicalResult DecodeOpaqueTensorHook(const OpaqueElementsAttr input, + ElementsAttr& output) { // NOLINT Builder builder(input.getType().getContext()); auto decoded_attr_or = tensorflow::DecodeOpaqueTensor(input, builder); if (!decoded_attr_or.ok()) { VLOG(2) << decoded_attr_or.status().error_message(); - return true; + return failure(); } output = decoded_attr_or.ValueOrDie(); - return false; + return success(); } -// Hooks for the TensorFlow dialect. -class TensorFlowHooks : public DialectHooks { - public: - DialectConstantFoldHook getConstantFoldHook() { - return TF::ConstantFoldFallbackHook; - } - DialectConstantDecodeHook getDecodeHook() { return DecodeOpaqueTensorHook; } -}; +static bool init_hooks = ([] () { + TF::TensorFlowDialect::RegisterDecodeConstantHook(DecodeOpaqueTensorHook); +}(), true); } // anonymous namespace - -// Static initialization for TensorFlow dialect hooks registration. 
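// Editor's note (illustration of the registration idiom, not part of the patch):
// both constant_fold.cc and decode_attributes_hook.cc now register their hooks
// with a namespace-scope static initializer instead of the removed
// mlir::DialectHooks machinery. The pattern is a lambda evaluated via the comma
// operator so registration happens once at load time:
//
//   static bool init_hooks = ([] {
//     TF::TensorFlowDialect::RegisterDecodeConstantHook(DecodeOpaqueTensorHook);
//   }(), true);
//
// The lambda is invoked purely for its side effect; the comma operator discards
// its result and initializes the bool to true.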
-static DialectHooksRegistration tf_hooks_registration("tf"); - } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/fold_switch.cc b/tensorflow/compiler/mlir/tensorflow/transforms/fold_switch.cc index b47378762a9..cc24c98a786 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/fold_switch.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/fold_switch.cc @@ -240,7 +240,7 @@ static LogicalResult FoldMergeNodes(FuncOp function, const DeadQueue& queue) { auto def_op = val.getDefiningOp(); #ifndef NDEBUG auto exec_dialect = - function.getContext()->getRegisteredDialect("tf_executor"); + function.getContext()->getLoadedDialect("tf_executor"); assert(def_op->getDialect() == exec_dialect && "unable to forward control dependencies"); #endif diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc index d23b977f0e3..11d74e87f96 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc @@ -23,6 +23,7 @@ limitations under the License. #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Function.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/IR/Verifier.h" // from @llvm-project @@ -31,8 +32,8 @@ limitations under the License. #include "mlir/Pass/PassRegistry.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" -#include "tensorflow/compiler/mlir/tensorflow/transforms/attribute_utils.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" #define DEBUG_TYPE "tf-functional-cf-to-region" @@ -53,8 +54,8 @@ struct FunctionalControlFlowToRegions // the input arguments are used as is (for IfOp) or block arguments of the same // type as the input arguments are created and then used as call arguments (for // While). -void CreateCall(Operation* op, FuncOp func, Region& caller_region, - ValueRange args, bool use_region_args) { +YieldOp CreateCall(Operation* op, FuncOp func, Region& caller_region, + ValueRange args, bool use_region_args) { assert(caller_region.empty() && "Expected empty region for newly created ops"); OpBuilder builder(caller_region); @@ -76,15 +77,26 @@ void CreateCall(Operation* op, FuncOp func, Region& caller_region, casted_args.push_back(arg); } auto call = builder.create(op->getLoc(), func, casted_args); - builder.create(op->getLoc(), call.getResults()); + return builder.create(op->getLoc(), call.getResults()); +} + +// Converts the condition for an IfOp/WhileOp to a boolean value. +Value ConvertConditionToBoolean(Operation* op, Value cond) { + if (auto ranked_type = cond.getType().dyn_cast()) + if (ranked_type.getRank() == 0 && + ranked_type.getElementType().isSignlessInteger(1)) + return cond; + + OpBuilder builder(op); + return builder.create(op->getLoc(), cond); } // Transform a functional IfOp to a region based IfRegionOp. 
LogicalResult ConvertIfOp(IfOp if_op) { + Value cond = ConvertConditionToBoolean(if_op, if_op.cond()); auto if_region = OpBuilder(if_op).create( - if_op.getLoc(), if_op.getResultTypes(), if_op.cond(), - if_op.is_stateless()); - CopyUnderscoredAttributes(if_op, if_region); + if_op.getLoc(), if_op.getResultTypes(), cond, if_op.is_stateless()); + CopyDeviceAndUnderscoredAttributes(if_op, if_region); CreateCall(if_op, if_op.then_func(), /*caller_region=*/if_region.then_branch(), if_op.input(), @@ -101,11 +113,16 @@ LogicalResult ConvertWhileOp(WhileOp while_op) { auto while_region = OpBuilder(while_op).create( while_op.getLoc(), while_op.getResultTypes(), while_op.input(), while_op.is_stateless(), while_op.parallel_iterations()); - CopyUnderscoredAttributes(while_op, while_region); + CopyDeviceAndUnderscoredAttributes(while_op, while_region); + + YieldOp cond_yield = + CreateCall(while_op, while_op.cond_func(), + /*caller_region=*/while_region.cond(), while_op.input(), + /*use_region_args=*/true); + Value i1_cond = + ConvertConditionToBoolean(cond_yield, cond_yield.getOperand(0)); + cond_yield.setOperand(0, i1_cond); - CreateCall(while_op, while_op.cond_func(), - /*caller_region=*/while_region.cond(), while_op.input(), - /*use_region_args=*/true); CreateCall(while_op, while_op.body_func(), /*caller_region=*/while_region.body(), while_op.input(), /*use_region_args=*/true); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/gpu_fusion.cc b/tensorflow/compiler/mlir/tensorflow/transforms/gpu_fusion.cc index 175baeb627f..fbe0524ce8b 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/gpu_fusion.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/gpu_fusion.cc @@ -91,7 +91,7 @@ struct ReluToFusedBatchNorm : public OpRewritePattern { // Build the newly fused operation to replace the batch norm OperationState state(batch_norm.getLoc(), - FusedBatchNormExOp::getOperationName()); + _FusedBatchNormExOp::getOperationName()); state.addOperands(batch_norm.getOperands()); if (side_input) state.operands.push_back(side_input); state.addTypes(batch_norm.getResultTypes()); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/launch_to_device_attribute.cc b/tensorflow/compiler/mlir/tensorflow/transforms/launch_to_device_attribute.cc index 9f67a3e7e71..4e507c8e760 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/launch_to_device_attribute.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/launch_to_device_attribute.cc @@ -104,7 +104,7 @@ LogicalResult HoistOpsAndAnnotateWithDevice(const Dialect* tf_dialect, } void LaunchToDeviceAttributePass::runOnFunction() { - const Dialect* tf_dialect = getContext().getRegisteredDialect("tf"); + const Dialect* tf_dialect = getContext().getLoadedDialect("tf"); if (!tf_dialect) { getFunction().emitError() << "'tf' dialect is not registered"; return signalPassFailure(); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc index 483c84b3e80..6946dc65104 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc @@ -26,6 +26,7 @@ limitations under the License. 
#include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/core/util/tensor_format.h" @@ -55,18 +56,27 @@ static DenseIntElementsAttr GetI64ElementsAttrForSeq(int start, int end, return DenseIntElementsAttr::get(ty, vals); } -// Returns int or float DenseElementsAttr with scalar shape with the given -// element type and the integer value. +// Returns int, float, or complex DenseElementsAttr with scalar shape with the +// given element type and the integer value. static DenseElementsAttr GetScalarOfType(Type ty, int64_t raw_value) { RankedTensorType scalar_ty = RankedTensorType::get({}, ty); if (auto float_ty = ty.dyn_cast_or_null()) { FloatAttr attr = FloatAttr::get(float_ty, raw_value); return DenseElementsAttr::get(scalar_ty, attr); + } else if (auto int_ty = ty.dyn_cast_or_null()) { + IntegerAttr attr = IntegerAttr::get(int_ty, raw_value); + return DenseElementsAttr::get(scalar_ty, attr); + } else if (auto complex_ty = ty.dyn_cast_or_null()) { + Type complex_element_ty = complex_ty.getElementType(); + if (complex_element_ty.isF32()) { + return DenseElementsAttr::get( + scalar_ty, static_cast>(raw_value)); + } else if (complex_element_ty.isF64()) { + return DenseElementsAttr::get( + scalar_ty, static_cast>(raw_value)); + } } - - auto int_ty = ty.cast(); - IntegerAttr attr = IntegerAttr::get(int_ty, raw_value); - return DenseElementsAttr::get(scalar_ty, attr); + llvm_unreachable("unsupported type"); } // Returns float DenseElementsAttr with scalar shape with the specified value. @@ -427,12 +437,38 @@ class LowerSparseMatMulOp : public OpRewritePattern { } }; +// Lowers _UnaryOpsComposition op as a series of original TensorFlow ops that +// were fused together. +class Lower_UnaryOpsComposition + : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(TF::_UnaryOpsCompositionOp op, + PatternRewriter &rewriter) const override { + Value result = op.x(); + for (StringRef op_name : + op.op_names().getAsRange()) { + std::string full_name = "tf." + op_name.str(); + // All ops in the sequences have the same result type as the original + // result type. + OperationState state(op.getLoc(), full_name, /*operands=*/{result}, + /*types=*/{op.getType()}, /*attributes=*/{}); + Operation *op = rewriter.createOperation(state); + result = op->getResult(0); + } + rewriter.replaceOp(op, {result}); + return success(); + } +}; + } // namespace void PopulateLoweringTFPatterns(MLIRContext *context, OwningRewritePatternList *patterns) { patterns->insert(context); + LowerPackOp, LowerSparseMatMulOp, Lower_UnaryOpsComposition>( + context); populateWithGenerated(context, patterns); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.td b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.td index 6b7d7178ab6..f7a867f3130 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.td @@ -195,8 +195,7 @@ def : Pat<(TF_PadOp TensorOf<[AnySignlessInteger, AnyFloat]>:$input, $paddings), // Reciprocal op patterns. //===----------------------------------------------------------------------===// -// TODO(hinsu): Support complex and unsigned input types. 
-def LowerReciprocal : Pat<(TF_ReciprocalOp TF_SintOrFpTensor:$x), +def LowerReciprocal : Pat<(TF_ReciprocalOp $x), (TF_DivOp (TF_ConstOp (GetScalarOfType<1> $x)), $x)>; //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/mark_ops_for_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/mark_ops_for_outside_compilation.cc index e538491ae9d..38cbe3f404e 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/mark_ops_for_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/mark_ops_for_outside_compilation.cc @@ -131,9 +131,28 @@ LogicalResult MarkUncompilableOps( return success(); } +// Unmarks outside compilation for any op that has parents already +// marked for outside compilation since the child will be extracted +// anyways. +void UnmarkChildren(Block* block) { + block->walk([&](Operation* op) { + if (!op->getAttrOfType(kXlaOutsideCompilationAttr)) return; + Operation* iter_op = op; + bool remove_attr = false; + while (auto* parent_op = iter_op->getParentOp()) { + if (parent_op->getAttrOfType(kXlaOutsideCompilationAttr)) { + remove_attr = true; + break; + } + iter_op = parent_op; + } + if (remove_attr) op->removeAttr(kXlaOutsideCompilationAttr); + }); +} + void MarkOpsForOutsideCompilation::runOnOperation() { auto module = getOperation(); - const Dialect* tf_dialect = getContext().getRegisteredDialect("tf"); + const Dialect* tf_dialect = getContext().getLoadedDialect("tf"); if (!tf_dialect) { getOperation().emitError() << "'tf' dialect is not registered"; return signalPassFailure(); @@ -168,6 +187,17 @@ void MarkOpsForOutsideCompilation::runOnOperation() { }); if (result.wasInterrupted()) return signalPassFailure(); + + module.walk([&](tf_device::ClusterOp cluster) { + // Only if `allow_soft_placement` attribute is true should we unmark ops + // for outside compilation. + auto soft_placement_attr = + cluster.getAttrOfType(kAllowSoftPlacementAttr); + if (!(soft_placement_attr && soft_placement_attr.getValue())) { + return; + } + UnmarkChildren(&cluster.GetBody()); + }); } } // namespace diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h index fb2d6e39da3..d93d9ddccaf 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h @@ -79,6 +79,11 @@ std::unique_ptr> CreateRewriteTPUEmbeddingOpsPass(); // Performs specific fusion for GPU targets. std::unique_ptr> CreateGpuOpFusionPass(); +// Create a pass that convert ops that copy tensors between devices, e.g. +// tf.Identity. +std::unique_ptr> +CreateTensorDeviceCopyConversionPass(); + struct LayoutOptimizationPipelineOptions : public PassPipelineOptions { Option force_data_format{ @@ -271,6 +276,9 @@ namespace TFTPU { // `_tpu_replicate` attribute. std::unique_ptr> CreateTPUClusterFormationPass(); +// Creates a pass that removes Identity/IdentityN ops from a cluster. +std::unique_ptr> CreateTPUIdentityPruningPass(); + // Creates a pass that allows TPU program inputs to have layouts determined at // run time. 
std::unique_ptr> CreateTPUDynamicLayoutPass(); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/region_control_flow_to_functional.cc b/tensorflow/compiler/mlir/tensorflow/transforms/region_control_flow_to_functional.cc index ba876e08fbb..1e403bff0eb 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/region_control_flow_to_functional.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/region_control_flow_to_functional.cc @@ -36,8 +36,8 @@ limitations under the License. #include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" -#include "tensorflow/compiler/mlir/tensorflow/transforms/attribute_utils.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" #define DEBUG_TYPE "tf-region-cf-to-functional" @@ -158,9 +158,11 @@ void ExtractSingleBlockRegion(Region& region, StringRef name, } // Returns call for region with single call whose result feeds into the -// terminator of the region. Returns none if the region doesn't contain just -// call and non-truncting casts ops. -llvm::Optional IsSingleCallRegion(Region& region) { +// terminator of the region. if `allow_to_bool` is true, also allows a single +// ToBoolOp between the region yield and the call. Returns none if the region +// does not conform to this pattern. +llvm::Optional IsSingleCallRegion(Region& region, + bool allow_to_bool = false) { if (!llvm::hasSingleElement(region)) return llvm::None; Block& block = region.front(); @@ -169,31 +171,44 @@ llvm::Optional IsSingleCallRegion(Region& region) { if (it == block.rend()) return llvm::None; + // Operation which is expected to consume all the call results. + Operation* call_consumer = yield; + + // Allow a single ToBoolOp between the call and the yield (valid only + // when the yield has a single operand) + if (allow_to_bool && yield.getNumOperands() == 1 && isa(*it)) { + if (it->getResult(0) != yield.getOperand(0)) return llvm::None; + call_consumer = cast(*it); + it++; + } + // Check if there is a Call before the Yield. CallOp call = dyn_cast(*it++); if (!call) return llvm::None; + // All call results should feed into expected consumer + // All results of the call should feed into the yield. + if (call.getNumResults() != call_consumer->getNumOperands()) + return llvm::None; + + for (auto res_it : llvm::zip(call.getResults(), call_consumer->getOperands())) + if (std::get<0>(res_it) != std::get<1>(res_it)) return llvm::None; + // There can only be non-truncating cast op's prior to the call. for (; it != block.rend(); ++it) { CastOp cast = dyn_cast(*it); if (!cast || cast.Truncate()) return llvm::None; } - // All results of the call should feed into the yield. - if (call.getNumResults() != yield.getNumOperands()) return llvm::None; - - for (auto res_it : llvm::zip(call.getResults(), yield.getOperands())) - if (std::get<0>(res_it) != std::get<1>(res_it)) return llvm::None; - return call; } -using MatcherFn = function_ref; +using ArgMatcherFn = function_ref; // Returns whether the arguments of the given 2 calls are match (after looking // through cast ops). `matcher` is the predicate used to check if two arguments // match. 
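// Editor's note (pseudo-IR, not part of the patch): with allow_to_bool set, a
// while-condition region of the following shape still counts as a "single call
// region", so the trivial (no-outlining) transform below can reuse the existing
// @cond function:
//
//   %result = call @cond(%args...)     // the single call
//   %pred = "tf.ToBool"(%result)       // permitted only when allow_to_bool
//   "tf.Yield"(%pred)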
-bool MatchCallArgs(CallOp first, CallOp second, MatcherFn matcher) { +bool MatchCallArgs(CallOp first, CallOp second, ArgMatcherFn matcher) { if (first.getNumOperands() != second.getNumOperands()) return false; Region& first_region = *first.getParentRegion(); @@ -225,38 +240,37 @@ struct TrivialTransformInfo { // List of callee names (one for each region). llvm::SmallVector callee_names; - // Constructor will analyze the 2 regions. - TrivialTransformInfo(Region& first, Region& second, MatcherFn matcher); + // Analyzes the given calls (from regions attached to the same parent op) to + // check if the parent op be transformed to functional form trivially (i.e., + // reusing existing functions and without outlining). This is possible when + // all the regions are single call regions (checked using matchers outside + // this class) and the all the calls match using the given argument matcher. + // + // If such a trivial transformation is possible, stash the relevant + // information needed for the transformation, else indicate that a trivial + // transformation is not possible by setting `can_transform` to false. + TrivialTransformInfo(llvm::Optional first_call, + llvm::Optional second_call, + ArgMatcherFn arg_matcher) { + if (!first_call || !second_call) return; + + if (!MatchCallArgs(first_call.getValue(), second_call.getValue(), + arg_matcher)) + return; + + can_transform = true; + callee_names = {first_call.getValue().getCallee(), + second_call.getValue().getCallee()}; + } }; -// Analyzes the given set of regions (attached to the same parent op) to check -// if the parent op be transformed to functional form trivially (i.e., reusing -// existing functions and without outlining). This is possible when all the -// regions are single call regions and the all the calls have the same -// arguments. -// -// If such a trivial transformation is possible, stash the relevant information -// needed for the transformation, else indicate that a trivial transformation is -// not possible by setting `can_transform` to false. -TrivialTransformInfo::TrivialTransformInfo(Region& first, Region& second, - MatcherFn matcher) { - auto call0 = IsSingleCallRegion(first); - auto call1 = IsSingleCallRegion(second); - if (!call0 || !call1) return; - - if (!MatchCallArgs(call0.getValue(), call1.getValue(), matcher)) return; - - can_transform = true; - callee_names = {call0.getValue().getCallee(), call1.getValue().getCallee()}; -} - // Transform IfRegionOp to IfOp. LogicalResult RegionControlFlowToFunctional::ConvertIfOp(IfRegionOp if_region) { llvm::SmallVector extern_values; // For IfOp, arguments of calls in the then and else regions match if they // are the same value. - auto if_matcher = [&](Value first, Region&, Value second, Region&) { + auto if_arg_matcher = [&](Value first, Region&, Value second, Region&) { if (first != second) return false; // collect the call arguments post lookup through cast Op's @@ -264,8 +278,9 @@ LogicalResult RegionControlFlowToFunctional::ConvertIfOp(IfRegionOp if_region) { return true; }; - const TrivialTransformInfo tti(if_region.then_branch(), - if_region.else_branch(), if_matcher); + const TrivialTransformInfo tti(IsSingleCallRegion(if_region.then_branch()), + IsSingleCallRegion(if_region.else_branch()), + if_arg_matcher); std::string then_name, else_name; @@ -293,16 +308,23 @@ LogicalResult RegionControlFlowToFunctional::ConvertIfOp(IfRegionOp if_region) { worklist, /*extern_values_passthrough=*/false); } + // Look through ToBool operations for the condition. 
+ Value cond = if_region.cond(); + auto to_bool = dyn_cast_or_null(cond.getDefiningOp()); + if (to_bool) cond = to_bool.getOperand(); + // Once we have the `then` and `else` functions ready (either outlined or // existing ones), replace the region based op with a functional control flow // op. OpBuilder builder(if_region); auto if_op = builder.create( - if_region.getLoc(), if_region.getResultTypes(), if_region.cond(), - extern_values, then_name, else_name, if_region.is_stateless()); - CopyUnderscoredAttributes(if_region, if_op); + if_region.getLoc(), if_region.getResultTypes(), cond, extern_values, + then_name, else_name, if_region.is_stateless()); + CopyDeviceAndUnderscoredAttributes(if_region, if_op); if_region.replaceAllUsesWith(if_op.getResults()); if_region.erase(); + + if (to_bool && to_bool.use_empty()) to_bool.erase(); return success(); } @@ -315,8 +337,8 @@ LogicalResult RegionControlFlowToFunctional::ConvertWhileOp( // cannot do a trivial transformation because post transform, we will need to // pass this extern value as an argument to the function, so we cannot use the // existing function as is. - auto while_matcher = [](Value first, Region& first_region, Value second, - Region& second_region) { + auto while_arg_matcher = [](Value first, Region& first_region, Value second, + Region& second_region) { if (!first.isa() || !second.isa()) return false; BlockArgument first_block_arg = first.cast(); @@ -329,8 +351,9 @@ LogicalResult RegionControlFlowToFunctional::ConvertWhileOp( second_block_arg.getParentBlock() == &second_region.front(); }; - const TrivialTransformInfo tti(while_region.cond(), while_region.body(), - while_matcher); + const TrivialTransformInfo tti( + IsSingleCallRegion(while_region.cond(), /*allow_to_bool=*/true), + IsSingleCallRegion(while_region.body()), while_arg_matcher); // All existing inputs to while region are inputs to the functional while. auto new_inputs = llvm::to_vector<4>(while_region.getOperands()); @@ -376,7 +399,7 @@ LogicalResult RegionControlFlowToFunctional::ConvertWhileOp( auto while_op = builder.create( while_region.getLoc(), new_result_types, new_inputs, cond_name, body_name, while_region.parallel_iterations(), while_region.is_stateless()); - CopyUnderscoredAttributes(while_region, while_op); + CopyDeviceAndUnderscoredAttributes(while_region, while_op); // Redirect old results to new results. 
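// Editor's note (not part of the patch): together with ConvertConditionToBoolean
// in functional_control_flow_to_regions.cc, these changes make the functional
// <-> region round trip lossless for non-i1 conditions. The forward pass inserts
// an explicit predicate conversion,
//
//   %pred = "tf.ToBool"(%cond)   // 0-d i1 condition for IfRegion / WhileRegion
//
// and the reverse pass accepts it again (for tf.If by looking through and
// erasing the unused ToBool, for tf.While via the allow_to_bool matcher), so the
// reconstructed functional op sees the original condition value.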
for (auto it : llvm::zip( diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc b/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc index ef75f90d5c1..d99279c0014 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc @@ -438,7 +438,7 @@ LogicalResult CreateIslandsFromReplicate(const Dialect* tf_dialect, void ReplicateToIslandPass::runOnOperation() { auto module = getOperation(); - const Dialect* tf_dialect = getContext().getRegisteredDialect("tf"); + const Dialect* tf_dialect = getContext().getLoadedDialect("tf"); if (!tf_dialect) { module.emitError() << "'tf' dialect is not registered"; return signalPassFailure(); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/resource_device_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/resource_device_inference.cc index bd0e8a94a61..c1ca98bf1f1 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/resource_device_inference.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/resource_device_inference.cc @@ -26,10 +26,13 @@ limitations under the License. #include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project @@ -39,6 +42,9 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/visitor_util.h" + +#define DEBUG_TYPE "tf-resource-device-inference" namespace mlir { namespace TF { @@ -132,6 +138,13 @@ inline StringRef GetDeviceAttr(Operation* op) { return device_attr ? device_attr.getValue() : ""; } +// Print operation with debug info (to get line number info for debugging) +void dump(StringRef message, Operation* op) { + llvm::dbgs() << message; + op->print(llvm::dbgs(), OpPrintingFlags().enableDebugInfo(true)); + llvm::dbgs() << "\n"; +} + // Propagates device assignment inside a function. LogicalResult ComputeResourceDevicesInComputation(FuncOp func_op, PerFunctionResult* result) { @@ -153,26 +166,67 @@ LogicalResult ComputeResourceDevicesInComputation(FuncOp func_op, if (failed(res)) return res; } - auto walk_res = func_op.walk([&](Operation* op) { - if (auto var_handle = dyn_cast(op)) { - // Record VarHandleOp's device attribute. - StringRef device_attr = GetDeviceAttr(op); - if (device_attr.empty()) return WalkResult::advance(); - auto res = AddResourceDeviceAndEmitError(var_handle.resource(), - device_attr, op, result); - if (failed(res)) return WalkResult::interrupt(); - } - if (auto identity = dyn_cast(op)) { - // Try to construct IdentityOp's attribute from recorded assignment. 
- if (!GetDeviceAttr(op).empty()) return WalkResult::advance(); - for (auto output : filter_resources(op->getResults())) { - if (auto device = result->DeviceForResource(output)) - identity.setAttr(kDeviceAttr, builder.getStringAttr(*device)); - } - return WalkResult::advance(); - } - return WalkResult::advance(); - }); + // To support WhileRegion, we need to propagate device attributes from + // WhileRegion operands to body/cond region arguments *prior* to visiting + // these regions. Use tensorflow::walk() instead of MLIR core walker to + // implement such a pre-order walk. + auto walk_res = tensorflow::GenericWalk( + func_op, [&](Operation* op, const tensorflow::WalkStage& stage) { + // We just need to visit operations in pre-order mode. + if (!stage.IsBeforeAllRegions()) return WalkResult::advance(); + + if (auto var_handle = dyn_cast(op)) { + // Record VarHandleOp's device attribute. + StringRef device_attr = GetDeviceAttr(op); + if (device_attr.empty()) return WalkResult::advance(); + auto res = AddResourceDeviceAndEmitError(var_handle.resource(), + device_attr, op, result); + if (failed(res)) return WalkResult::interrupt(); + } else if (auto identity = dyn_cast(op)) { + LLVM_DEBUG(dump("Visiting ", identity)); + // Try to construct IdentityOp's attribute from recorded assignment. + if (!GetDeviceAttr(op).empty()) return WalkResult::advance(); + for (auto output : filter_resources(op->getResults())) { + LLVM_DEBUG(llvm::dbgs() << " Processing output #" + << output.getResultNumber() << "\n"); + if (auto device = result->DeviceForResource(output)) { + LLVM_DEBUG(llvm::dbgs() + << " Setting device = " << *device << "\n"); + identity.setAttr(kDeviceAttr, builder.getStringAttr(*device)); + } + } + } else if (auto while_region = dyn_cast(op)) { + // For WhileRegion, do local analysis prior to visiting the attached + // regions and propagate device annotations to the cond and body + // region arguments. The annotations are the union of annotations + // on the input and result. Resource alias analysis already propagates + // resource ID from the inputs to the results for a while, so just + // need to consider the results. 
+ LLVM_DEBUG(llvm::dbgs() << "Visiting WhileRegion\n"); + + for (auto output : filter_resources(while_region.getResults())) { + auto device = result->DeviceForResource(output); + int output_index = output.getResultNumber(); + if (!device) { + LLVM_DEBUG(llvm::dbgs() + << " No device for output #" << output_index << "\n"); + continue; + } + // Transfer the annotation to both region arguments + for (Region* region : while_region.getRegions()) { + BlockArgument arg = region->getArgument(output_index); + LLVM_DEBUG(llvm::dbgs() + << " Propagating device = '" << *device + << "' to arg #" << output_index << " of region #" + << region->getRegionNumber() << "\n"); + if (failed(AddResourceDeviceAndEmitError(arg, *device, + while_region, result))) + return WalkResult::interrupt(); + } + } + } + return WalkResult::advance(); + }); return failure(walk_res.wasInterrupted()); } @@ -201,6 +255,10 @@ void ResourceDeviceInference::runOnOperation() { Value arg_operand = caller_operands[arg.getArgNumber()]; auto device = caller_res.DeviceForResource(arg_operand); if (!device) continue; + LLVM_DEBUG(llvm::dbgs() + << "Propagating '" << *device << "' to arg #" + << arg.getArgNumber() << " of function @" + << callee.getName() << "\n"); if (failed(AddResourceDeviceAndEmitError(arg, *device, caller, &callee_res, &callee_needs_recompute))) @@ -240,6 +298,8 @@ void ResourceDeviceInference::runOnOperation() { "call"); return WalkResult::interrupt(); } + LLVM_DEBUG(llvm::dbgs() + << "Visiting call to function @" << func.getName() << "\n"); if (failed(propagate_operands_to_callee_arguments( call, call.getArgOperands(), {func}, func_res))) return WalkResult::interrupt(); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc index 702455d156d..77f672f5ee4 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc @@ -21,6 +21,7 @@ limitations under the License. #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project @@ -330,15 +331,6 @@ LogicalResult HoistResourceOpsFromCluster(tf_device::ClusterOp cluster, getUsedValuesDefinedAbove(new_cluster.body(), new_cluster.body(), captured_values); - for (Value v : captured_values) { - auto tensor_type = v.getType().dyn_cast(); - if (!tensor_type) continue; - if (!tensor_type.getElementType().isa()) continue; - - return new_cluster.emitOpError() - << "has remaining resource inputs that can not be lifted"; - } - return success(); } @@ -361,29 +353,23 @@ LogicalResult FindResourceArgUseInfo( ResourceArgUseInfo info; info.used = false; info.updated = false; - bool do_not_touch = false; + bool read_or_assigned = false; for (auto user : arg.getUsers()) { if (user == return_op) continue; + info.used = true; if (auto read = llvm::dyn_cast(user)) { - info.used = true; + read_or_assigned = true; info.data_type = read.getType(); continue; } if (auto assign = llvm::dyn_cast(user)) { - info.used = true; + read_or_assigned = true; info.updated = true; info.data_type = assign.value().getType(); continue; } - if (isa(user)) { - // Stacks will be handled by a separate pass. 
- do_not_touch = true; - break; - } - user->emitOpError("found unsupported operations on resource."); - return failure(); } - if (!do_not_touch) (*result)[arg.getArgNumber()] = info; + if (!info.used || read_or_assigned) (*result)[arg.getArgNumber()] = info; } return success(); } @@ -914,8 +900,8 @@ LogicalResult HandlePartitionedCallOpCallee( // resource-lifted new callee function in lifting_info. template void UpdatePartitionedCallOpWithNewCallee( - CallOpType call_op, const PartitionedCallLiftingInfo& lifting_info) { - if (lifting_info.lifted_callee == nullptr) return; + CallOpType call_op, PartitionedCallLiftingInfo& lifting_info) { + if (!lifting_info.lifted_callee) return; // Replace output resource uses with the aliasing input, so that we can remove // this output. for (const auto& entry : lifting_info.old_outputs_aliasing_old_inputs) { @@ -929,12 +915,10 @@ void UpdatePartitionedCallOpWithNewCallee( auto new_operands = FilterRange(call_op.args(), lifting_info.use_info); auto new_call = builder.create( - call_op.getLoc(), - const_cast(lifting_info.lifted_callee).getType().getResults(), + call_op.getLoc(), lifting_info.lifted_callee.getType().getResults(), new_operands, call_op.getAttrs()); new_call.setAttr( - "f", builder.getSymbolRefAttr( - const_cast(lifting_info.lifted_callee).getName())); + "f", builder.getSymbolRefAttr(lifting_info.lifted_callee.getName())); AddLoadsStoresOutsideControlFlowOp( new_call, lifting_info.arg_data_type_and_updated_output_index); // Replace uses. @@ -949,7 +933,8 @@ void UpdatePartitionedCallOpWithNewCallee( } LogicalResult HoistForFunctionalControlFlow( - Block*, ModuleOp, llvm::SmallDenseMap*); + Block*, ModuleOp, + llvm::SmallDenseMap*); // A templated routine for handling both PartitionedCallOp and // StatefulPartitionedCallOp. If the callee is already lifted, it just updates @@ -958,9 +943,10 @@ LogicalResult HoistForFunctionalControlFlow( template LogicalResult HandlePartitionedCallOp( CallOpType call_op, FuncOp callee, ModuleOp module, - llvm::SmallDenseMap* lifted_callees) { - auto emplace_res = - lifted_callees->try_emplace(callee, PartitionedCallLiftingInfo()); + llvm::SmallDenseMap* + lifted_callees) { + auto emplace_res = lifted_callees->try_emplace(callee.getName(), + PartitionedCallLiftingInfo()); if (emplace_res.second) { // Unseen callee. Perform resource lifting on it. HoistForFunctionalControlFlow(&callee.front(), module, lifted_callees); @@ -977,7 +963,7 @@ LogicalResult HandlePartitionedCallOp( // body/cond/branch/callee functions. LogicalResult HoistForFunctionalControlFlow( Block* block, ModuleOp module, - llvm::SmallDenseMap* + llvm::SmallDenseMap* lifted_partitioned_call_callees) { // Remove identity nodes to avoid aliasing. RemoveIdentity(block); @@ -1056,7 +1042,7 @@ LogicalResult HoistForFunctionalControlFlow( // Returns failure if there are remaining resource-type values that can not be // lifted. 
void ResourceOpLiftingPass::runOnOperation() { - llvm::SmallDenseMap + llvm::SmallDenseMap lifted_partitioned_call_callees; ModuleOp module = getOperation(); auto result = module.walk([&](FuncOp func_op) { @@ -1121,7 +1107,7 @@ LogicalResult ResourceLiftingForFunctionalControlFlow(FuncOp function) { << function.getBlocks().size(); } - llvm::SmallDenseMap + llvm::SmallDenseMap lifted_partitioned_call_callees; return HoistForFunctionalControlFlow(&function.front(), cast(function.getParentOp()), diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc index 597fbe2c0b1..88ad787df3e 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc @@ -40,6 +40,7 @@ limitations under the License. #include "mlir/IR/SymbolTable.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/FoldInterfaces.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project @@ -596,7 +597,7 @@ ShapeInference::ShapeInference(int64_t graph_version, MLIRContext* context, bool propagate_caller_callee_constants) : graph_version_(graph_version), propagate_caller_callee_constants_(propagate_caller_callee_constants) { - tf_dialect_ = context->getRegisteredDialect(); + tf_dialect_ = context->getLoadedDialect(); } ShapeHandle ShapeInference::ComputeOutputAsShape(OpResult result, @@ -697,11 +698,8 @@ bool ShapeInference::RefineShapeForPassThroughOps(Operation* op) { // TODO(jpienaar): The tf.Cast op, which is uniformly inserted at the // moment, cannot handle arbirary types (e.g., it can't handle quantized // types). This restriction can be relaxed if not only tf.Cast is used. - auto kind = t.getKind(); - return (kind >= Type::FIRST_STANDARD_TYPE && - kind < Type::LAST_STANDARD_TYPE) || - (kind >= Type::FIRST_TENSORFLOW_TYPE && - kind < Type::LAST_TENSORFLOW_TYPE); + return t.getDialect().getNamespace().empty() || + isa(t.getDialect()); }; bool changed = false; @@ -1174,10 +1172,11 @@ LogicalResult ShapeInference::TryToFold(Operation* op) { if (!dialect) return failure(); // Only attempt TF dialect fallback if there are no unknown operands. if (some_unknown && dialect == tf_dialect_) return failure(); - SmallVector constants; - if (failed(dialect->constantFoldHook(op, constant_operands, constants))) + auto* interface = dialect->getRegisteredInterface(); + if (!interface) return failure(); + + if (failed(interface->fold(op, constant_operands, fold_results))) return failure(); - fold_results.assign(constants.begin(), constants.end()); } for (auto result : zip(op->getResults(), fold_results)) { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_device_copy_conversion.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_device_copy_conversion.cc new file mode 100644 index 00000000000..f14efeb91ce --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_device_copy_conversion.cc @@ -0,0 +1,81 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Transforms/DialectConversion.h" +#include "mlir/Transforms/Passes.h" +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Pass/PassOptions.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" + +namespace mlir { +namespace TF { +namespace { + +// Deletes the op and forwards the arguments. +template +class PassThroughConversion : public mlir::OpConversionPattern { + public: + explicit PassThroughConversion(MLIRContext *context) + : mlir::OpConversionPattern(context) {} + + LogicalResult matchAndRewrite( + TF_Op op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { // NOLINT + // Just forward the arguments to results. + rewriter.replaceOp(op, operands); + return success(); + } +}; + +class TensorDeviceCopyConversionPass + : public PassWrapper { + public: + void runOnFunction() override { + mlir::OwningRewritePatternList patterns; + mlir::ConversionTarget target(getContext()); + + // TODO(tfrt-devs): when device placer is introduced in the lowering pass, + // we need to check if Identity op and it's previous op are placed on the + // same device. If not, we don't fold Identity op since it's used for tensor + // copying between devices. + patterns.insert, + PassThroughConversion>(&getContext()); + + if (failed(applyPartialConversion(getFunction(), target, patterns))) { + signalPassFailure(); + } + } +}; + +} // namespace + +std::unique_ptr> +CreateTensorDeviceCopyConversionPass() { + return std::make_unique(); +} + +static mlir::PassRegistration + tensor_device_copy_pass( + "tf-tensor-device-copy", + "Handle ops that copy tensors between devices. E.g., tf.Identity."); + +} // namespace TF +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_device_assignment.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tf_device_assignment.cc index 2a770b2615d..f26887eb276 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tf_device_assignment.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_device_assignment.cc @@ -34,7 +34,7 @@ class SimpleTFDeviceAssignmentPass void runOnFunction() override { Builder builder(&getContext()); - Dialect* tf = getContext().getRegisteredDialect(); + Dialect* tf = getContext().getLoadedDialect(); getFunction().walk([&](Operation* op) { if (auto device_attr = op->getAttrOfType("device")) { // We assign default device to ops with device attribute that is empty. 
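The getLoadedDialect/loadDialect substitutions running through this patch track MLIR's switch to per-context dialect loading: a dialect now has to be loaded into the MLIRContext before it can be looked up, instead of being available from global registration alone. A minimal sketch of the new pattern follows, assuming the TF::TensorFlowDialect class; the template arguments inside the angle brackets do not survive in the text above, so the exact instantiations are inferred, not copied from the patch.

// Illustrative sketch only, not part of the patch.
#include "mlir/IR/MLIRContext.h"
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"

mlir::Dialect* GetTfDialect(mlir::MLIRContext& context) {
  // Before this change, context.getRegisteredDialect<mlir::TF::TensorFlowDialect>()
  // returned the dialect as soon as it was globally registered.
  // Now the dialect must be explicitly loaded into this particular context
  // before it can be queried.
  context.loadDialect<mlir::TF::TensorFlowDialect>();
  return context.getLoadedDialect<mlir::TF::TensorFlowDialect>();
}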
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc index 2be6ee7a78c..fed4002bfcf 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc @@ -331,7 +331,8 @@ void RemoveClusterAliasedOutputs(OpBuilder* builder, for (auto result : llvm::zip(cluster_terminator->getOperands(), cluster.getResults())) { Value cluster_terminator_operand = std::get<0>(result); - if (cluster.getOperation()->isProperAncestor( + if (cluster_terminator_operand.getDefiningOp() && + cluster.getOperation()->isProperAncestor( cluster_terminator_operand.getDefiningOp())) { new_cluster_results.push_back(cluster_terminator_operand); new_cluster_result_types.push_back(cluster_terminator_operand.getType()); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc index 8adafe05cd3..b141a7dc792 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc @@ -314,21 +314,41 @@ tf_device::LaunchOp CreateLaunchOpForOutsideCluster( return launch_op; } -// Extracts all externally provided operands of `cluster_ops`. +// Extracts all externally provided operands of `host_cluster_ops`. llvm::SmallSetVector GetExternalOperands( - llvm::ArrayRef cluster_ops) { + tf_device::ClusterOp tpu_cluster, + llvm::ArrayRef host_cluster_ops) { llvm::SmallSetVector external_values; - for (Operation* op : cluster_ops) { - for (Value v : op->getOperands()) { - Operation* defining_op = v.getDefiningOp(); - if (!defining_op) continue; - bool is_external = llvm::none_of(cluster_ops, [&](Operation* cluster_op) { - return defining_op == cluster_op; - }); + for (Operation* host_cluster_op : host_cluster_ops) { + auto cluster_op_parent_region = host_cluster_op->getParentRegion(); + host_cluster_op->walk([&](Operation* op) { + auto region = op->getParentRegion(); - if (is_external) external_values.insert(v); - } + if (region == cluster_op_parent_region) { + // For op operands, add operand defining ops, if they are not included + // in `host_cluster_ops`. + for (Value v : op->getOperands()) { + Operation* defining_op = v.getDefiningOp(); + if (!defining_op) continue; + bool is_external = llvm::none_of( + host_cluster_ops, + [&](Operation* cluster_op) { return defining_op == cluster_op; }); + + if (is_external) external_values.insert(v); + } + } else { + llvm::SetVector external_captured_inputs; + visitUsedValuesDefinedAbove(*region, *region, [&](OpOperand* operand) { + Region* parent_region = operand->get().getParentRegion(); + if (!tpu_cluster.body().isAncestor(parent_region)) return; + + external_captured_inputs.insert(operand->get()); + }); + external_values.insert(external_captured_inputs.begin(), + external_captured_inputs.end()); + } + }); } return external_values; @@ -494,7 +514,7 @@ void CreateParallelExecuteFromOutsideClusters(ModuleOp module, &builder, cluster_ops.back(), host_device); // Determine if there are any inputs that are provided out of cluster. 
- auto external_inputs = GetExternalOperands(cluster_ops); + auto external_inputs = GetExternalOperands(tpu_cluster, cluster_ops); auto external_outputs = GetExternalOutputs(cluster_ops); MoveOutsideCompiledOps(module, tpu_cluster, cluster.value().getFirst(), diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_identity_pruning.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_identity_pruning.cc new file mode 100644 index 00000000000..32b1eb340d6 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_identity_pruning.cc @@ -0,0 +1,113 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Region.h" // from @llvm-project +#include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace TFTPU { + +namespace { + +// This pass removes Identity/IdentityN ops from the TPU computation and +// reachable functions. +// TODO(lyandy): Remove this pass once resource op lifting is migrated to use +// resource alias analysis and support region based control flow. Removing +// Identity ops may remove `_XlaSharding` annotation attribute if Identity ops +// are used to propagate such information. + +struct TPUIdentityPruning + : public PassWrapper> { + void runOnOperation() override; +}; + +// Collects all reachable functions (via call ops) from a given region. +SmallVector CollectReachableFunctions(Region& region) { + llvm::SmallPtrSet reachable_funcs; + + auto collect_reachable_funcs = + [&reachable_funcs](Region& src, SmallVectorImpl& funcs_to_visit) { + src.walk([&reachable_funcs, &funcs_to_visit](CallOpInterface call_op) { + auto func = dyn_cast_or_null(call_op.resolveCallable()); + if (func && reachable_funcs.insert(func).second) + funcs_to_visit.push_back(func); + }); + }; + + SmallVector funcs_to_visit; + collect_reachable_funcs(region, funcs_to_visit); + + while (!funcs_to_visit.empty()) { + SmallVector new_funcs_to_visit; + for (FuncOp func_to_visit : funcs_to_visit) { + if (!func_to_visit.getCallableRegion()) continue; + collect_reachable_funcs(*func_to_visit.getCallableRegion(), + new_funcs_to_visit); + } + funcs_to_visit.swap(new_funcs_to_visit); + } + + return llvm::to_vector<4>(reachable_funcs); +} + +// Removes Identity/IdentityN ops from a region and forwards its operands to its +// results. 
+void RemoveIdentityFromRegion(Region& region) { + region.walk([](Operation* op) { + if (isa(op)) { + op->replaceAllUsesWith(op->getOperands()); + op->erase(); + } + }); +} + +void TPUIdentityPruning::runOnOperation() { + SmallVector clusters; + getOperation().walk( + [&](tf_device::ClusterOp cluster) { clusters.push_back(cluster); }); + + for (tf_device::ClusterOp cluster : clusters) { + RemoveIdentityFromRegion(cluster.body()); + auto reachable_funcs = CollectReachableFunctions(cluster.body()); + for (FuncOp reachable_func : reachable_funcs) + RemoveIdentityFromRegion(*reachable_func.getCallableRegion()); + } +} + +} // anonymous namespace + +std::unique_ptr> CreateTPUIdentityPruningPass() { + return std::make_unique(); +} + +static PassRegistration pass( + "tf-tpu-identity-pruning", + "Removes Identity/IdentityN ops from the TPU computation"); + +} // namespace TFTPU +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc index ca77feafc05..21ad457a7a6 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc @@ -409,12 +409,15 @@ Operation* BuildCompileOp( std::string txt_module; if (failed(EncapsulateFuncAndSerialize(func, &txt_module))) return nullptr; - auto result_type = + auto compilation_status_type = RankedTensorType::get({}, builder->getType()); + auto program_type = + RankedTensorType::get({2}, builder->getType()); auto compile_op = builder->create( - cluster_func.getLoc(), /*compilation_status=*/result_type, /*program=*/ - llvm::SmallVector(num_cores_per_replica, result_type), + cluster_func.getLoc(), + /*compilation_status=*/compilation_status_type, /*program=*/ + llvm::SmallVector(num_cores_per_replica, program_type), compile_op_operands, txt_module, txt_metadata); return WrapOpInLaunch(builder, compile_op.getLoc(), compile_op, @@ -598,9 +601,9 @@ void BuildTPUCompileSucceededAssertOp(Operation* compile_op, // func @main(%arg0: tensor) { // %0 = "tf.Shape"(%arg0) : (tensor) -> tensor // %1:2 = "tf._TPUCompileMlir"(%0) {device = "/CPU:0"} : -// (tensor) -> (tensor, tensor) +// (tensor) -> (tensor, tensor<2x!tf.string>) // %2 = "tf.TPUExecute"(%arg0, %1#0) {device = "/TPU:0"} : -// (tensor, tensor) -> tensor +// (tensor, tensor<2x!tf.string>) -> tensor // return // } // @@ -624,9 +627,9 @@ void BuildTPUCompileSucceededAssertOp(Operation* compile_op, // {n = 2 : i32, devices = ["/TPU:0", "/TPU:1"]} { // %1 = "tf.Shape"(%ri) : (tensor) -> tensor // %2:2 = "tf._TPUCompileMlir"(%1) {device = "/CPU:0"} : -// (tensor) -> (tensor, tensor) +// (tensor) -> (tensor, tensor<2x!tf.string>) // %3 = "tf.TPUExecute"(%ri, %2#0) : -// (tensor, tensor) -> tensor +// (tensor, tensor<2x!tf.string>) -> tensor // tf_device.return %3 : tensor // } // return diff --git a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc index 571d5e3e715..631553b381e 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc @@ -726,7 +726,7 @@ Status Exporter::Convert(mlir::ModuleOp module, mlir::Identifier::get("main", module.getContext()); absl::optional entry_func; FunctionDefLibrary flib; - auto tf_dialect = module.getContext()->getRegisteredDialect("tf"); + auto tf_dialect = module.getContext()->getLoadedDialect("tf"); for (auto 
function : module.getOps()) { if (function.isExternal()) return errors::FailedPrecondition("External functions not supported"); @@ -799,7 +799,7 @@ StatusOr> ConvertMlirToGraphdef( stream_executor::port::Status ConvertMlirFunctionToFunctionLibraryDef( mlir::FuncOp func, const GraphExportConfig& configs, FunctionDef* function_def) { - Dialect* tf_dialect = func.getContext()->getRegisteredDialect("tf"); + Dialect* tf_dialect = func.getContext()->getLoadedDialect("tf"); FunctionDefLibrary flib; TF_RETURN_IF_ERROR( Exporter::ConvertLibFunction(configs, tf_dialect, func, &flib)); diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index 94ddf76736e..692d0eaf962 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -177,7 +177,8 @@ Status UpgradeLegacyGraph(Graph* graph, FunctionLibraryDefinition* flib_def, restrict_functionalization_to_tpu_nodes ? [](const Node* n) { return n->attrs().Find(kTpuReplicateAttr); } : NodeFilter{}; - return FunctionalizeControlFlow(graph, flib_def, node_filter); + return FunctionalizeControlFlow(graph, flib_def, node_filter, + /*include_functions=*/true); } // Stateful helper class to import a TensorFlow model into an MLIR Module. @@ -2135,6 +2136,11 @@ StatusOr GraphDefImporter::Convert( mlir::MLIRContext* context, const Graph& graph, const GraphDebugInfo& debug_info, const FunctionLibraryDefinition& flib_def, const GraphImportConfig& specs, llvm::StringRef func_name) { + // Load dialects involved in the conversion + context->loadDialect(); + context->loadDialect(); + context->loadDialect(); + mlir::OwningModuleRef module = mlir::ModuleOp::create(mlir::UnknownLoc::get(context)); std::unordered_map tf_name_to_mlir_name; diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc index 1c7988d3a40..58377661a23 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc @@ -219,22 +219,18 @@ StatusOr GraphdefToSplattedMlirTranslateFunction( if (auto attr = inst.getAttrOfType(attr_id)) { mlir::Attribute rand_val; mlir::Type element_type = attr.getType().getElementType(); + if (element_type.isa()) { + rand_val = mlir::IntegerAttr::get(element_type, std::rand()); + } else if (element_type.isF16() || element_type.isF32() || + element_type.isF64()) { + rand_val = mlir::FloatAttr::get(element_type, + std::rand() * 1.0 / RAND_MAX); - switch (element_type.getKind()) { - case mlir::StandardTypes::Integer: - rand_val = mlir::IntegerAttr::get(element_type, std::rand()); - break; - case mlir::StandardTypes::F16: - case mlir::StandardTypes::F32: - case mlir::StandardTypes::F64: - rand_val = mlir::FloatAttr::get(element_type, - std::rand() * 1.0 / RAND_MAX); - break; - default: - inst.emitWarning() - << "Skipping splat conversion for " - << "an unsupported attribute type " << element_type; - continue; + } else { + inst.emitWarning() + << "Skipping splat conversion for " + << "an unsupported attribute type " << element_type; + continue; } auto new_attr = mlir::DenseElementsAttr::get(attr.getType(), rand_val); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/attribute_utils.h b/tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h similarity index 66% rename from 
tensorflow/compiler/mlir/tensorflow/transforms/attribute_utils.h rename to tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h index 599a8df63d7..bd81cae5730 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/attribute_utils.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_ATTRIBUTE_UTILS_H_ -#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_ATTRIBUTE_UTILS_H_ +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_ATTRIBUTE_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_ATTRIBUTE_UTILS_H_ #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project @@ -36,7 +36,18 @@ inline void CopyUnderscoredAttributes(Operation *from, Operation *to) { }); } +// Copies attributes that are either `device` or whose name begins with an _ +// from `from` to `to`. +// TODO(b/158769932): This should be a general feature instead post some policy +// discussion. +inline void CopyDeviceAndUnderscoredAttributes(Operation *from, Operation *to) { + auto device = mlir::Identifier::get("device", from->getContext()); + CopyAttributes(from, to, [&device](const NamedAttribute &attr) { + return attr.first.strref().front() == '_' || attr.first == device; + }); +} + } // namespace TF } // namespace mlir -#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_ATTRIBUTE_UTILS_H_ +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_ATTRIBUTE_UTILS_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc index 99a5e32adc2..f7a9823a1a8 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc @@ -420,6 +420,7 @@ Status CompileSerializedMlirToXlaHlo( std::vector> custom_legalization_passes) { RegisterDialects(); mlir::MLIRContext mlir_context; + mlir_context.loadAllGloballyRegisteredDialects(); mlir::OwningModuleRef mlir_module; TF_RETURN_IF_ERROR( @@ -509,6 +510,7 @@ Status CompileGraphToXlaHlo( RegisterDialects(); mlir::MLIRContext context; + context.loadAllGloballyRegisteredDialects(); GraphImportConfig config; config.graph_as_function = true; // Disable shape inference during import as some TensorFlow op fails during diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc index 359314a64b0..05e1f059029 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc @@ -36,8 +36,8 @@ limitations under the License. #include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/framework/tensor_shape.pb.h" #include "tensorflow/core/framework/types.pb.h" -#include "tensorflow/core/lib/bfloat16/bfloat16.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/bfloat16.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/tstring.h" @@ -161,7 +161,7 @@ StatusOr ConvertTensor(const Tensor& input_tensor, default: // TODO(shpeisman): restructure code to reuse dialect pointer across // calls. 
- auto* dialect = builder->getContext()->getRegisteredDialect("tf"); + auto* dialect = builder->getContext()->getLoadedDialect("tf"); return OpaqueElementsAttr::get(dialect, type, MangleTensor(input_tensor)); } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc index bf96e3d1df4..4917d73ba2a 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc @@ -43,6 +43,7 @@ static void RegisterDialects() { TEST(ConvertTypeToTensorTypeTest, UnrankedTensorType) { mlir::MLIRContext context; + context.loadAllGloballyRegisteredDialects(); mlir::Builder b(&context); PartialTensorShape output_shape = @@ -52,6 +53,7 @@ TEST(ConvertTypeToTensorTypeTest, UnrankedTensorType) { TEST(ConvertTypeToTensorTypeTest, NonFullyDefinedRankedTensorType) { mlir::MLIRContext context; + context.loadAllGloballyRegisteredDialects(); mlir::Builder b(&context); PartialTensorShape output_shape = ConvertTypeToTensorShape( @@ -61,6 +63,7 @@ TEST(ConvertTypeToTensorTypeTest, NonFullyDefinedRankedTensorType) { TEST(ConvertTypeToTensorTypeTest, FullyDefinedRankedTensorType) { mlir::MLIRContext context; + context.loadAllGloballyRegisteredDialects(); mlir::Builder b(&context); PartialTensorShape output_shape = ConvertTypeToTensorShape( diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_type.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_type.cc index 0caceb69510..0d035e8f864 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_type.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_type.cc @@ -91,64 +91,62 @@ Status ConvertDataType(DataType dtype, Builder builder, Type* type) { } Status ConvertScalarTypeToDataType(Type type, DataType* dtype) { - switch (type.getKind()) { - case mlir::StandardTypes::F16: - *dtype = DT_HALF; - return Status::OK(); - case mlir::StandardTypes::F32: - *dtype = DT_FLOAT; - return Status::OK(); - case mlir::StandardTypes::F64: - *dtype = DT_DOUBLE; - return Status::OK(); - case mlir::StandardTypes::BF16: - *dtype = DT_BFLOAT16; - return Status::OK(); - case mlir::StandardTypes::Integer: { - const auto& itype = type.cast(); - switch (itype.getWidth()) { - case 1: - *dtype = DT_BOOL; - return Status::OK(); - case 8: - *dtype = itype.isUnsigned() ? DT_UINT8 : DT_INT8; - return Status::OK(); - case 16: - *dtype = itype.isUnsigned() ? DT_UINT16 : DT_INT16; - return Status::OK(); - case 32: - *dtype = itype.isUnsigned() ? DT_UINT32 : DT_INT32; - return Status::OK(); - case 64: - *dtype = itype.isUnsigned() ? 
DT_UINT64 : DT_INT64; - return Status::OK(); - default: - return errors::Unimplemented( - absl::StrCat("Converting ", debugString(type), " to DataType")); - } - } - case mlir::StandardTypes::Complex: { - auto etype = type.cast().getElementType(); - if (etype.isF32()) { - *dtype = DT_COMPLEX64; - return Status::OK(); - } else if (etype.isF64()) { - *dtype = DT_COMPLEX128; - return Status::OK(); - } - return errors::Unimplemented( - absl::StrCat("Converting ", debugString(type), " to DataType")); - } -#define HANDLE_TF_TYPE(tftype, enumerant, name) \ - case mlir::TF::TensorFlowTypes::enumerant: \ - *dtype = DT_##enumerant; \ + if (type.isF16()) { + *dtype = DT_HALF; return Status::OK(); + } else if (type.isF32()) { + *dtype = DT_FLOAT; + return Status::OK(); + } else if (type.isF64()) { + *dtype = DT_DOUBLE; + return Status::OK(); + } else if (type.isBF16()) { + *dtype = DT_BFLOAT16; + return Status::OK(); + } else if (auto itype = type.dyn_cast()) { + switch (itype.getWidth()) { + case 1: + *dtype = DT_BOOL; + return Status::OK(); + case 8: + *dtype = itype.isUnsigned() ? DT_UINT8 : DT_INT8; + return Status::OK(); + case 16: + *dtype = itype.isUnsigned() ? DT_UINT16 : DT_INT16; + return Status::OK(); + case 32: + *dtype = itype.isUnsigned() ? DT_UINT32 : DT_INT32; + return Status::OK(); + case 64: + *dtype = itype.isUnsigned() ? DT_UINT64 : DT_INT64; + return Status::OK(); + default: + return errors::Unimplemented( + absl::StrCat("Converting ", debugString(type), " to DataType")); + } + } else if (auto complex_type = type.dyn_cast()) { + auto etype = complex_type.getElementType(); + if (etype.isF32()) { + *dtype = DT_COMPLEX64; + return Status::OK(); + } else if (etype.isF64()) { + *dtype = DT_COMPLEX128; + return Status::OK(); + } + return errors::Unimplemented( + absl::StrCat("Converting ", debugString(type), " to DataType")); + } + +#define HANDLE_TF_TYPE(tftype, enumerant, name) \ + if (type.isa()) { \ + *dtype = DT_##enumerant; \ + return Status::OK(); \ + } // NOLINTNEXTLINE #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.def" - default: - return errors::Unimplemented( - absl::StrCat("Converting ", debugString(type), " to DataType")); - } + + return errors::Unimplemented( + absl::StrCat("Converting ", debugString(type), " to DataType")); } Status ConvertToDataType(Type type, DataType* dtype) { diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_type_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_type_test.cc index 07f6b129a41..5b791752eb0 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_type_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_type_test.cc @@ -36,6 +36,7 @@ std::string ConvertToMlirString(const std::vector& dims, } mlir::MLIRContext context; mlir::Builder b(&context); + context.loadAllGloballyRegisteredDialects(); auto status_or = ConvertToMlirTensorType(shape, dtype, &b); std::string buf; llvm::raw_string_ostream os(buf); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/device_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/device_util_test.cc index 1da1f5973f6..e41b62ddccd 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/device_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/device_util_test.cc @@ -60,6 +60,7 @@ class FakeDevice : public Device { TEST(DeviceUtilTest, AddDeviceToOp) { mlir::MLIRContext context; + context.loadAllGloballyRegisteredDialects(); mlir::OwningModuleRef module_ref = mlir::ModuleOp::create(mlir::UnknownLoc::get(&context)); @@ -101,6 +102,7 @@ 
TEST(DeviceUtilTest, AddDeviceToOp) { TEST(DeviceUtilTest, AddDeviceToOpNullDeviceSet) { mlir::MLIRContext context; + context.loadAllGloballyRegisteredDialects(); mlir::OwningModuleRef module_ref = mlir::ModuleOp::create(mlir::UnknownLoc::get(&context)); @@ -110,6 +112,7 @@ TEST(DeviceUtilTest, AddDeviceToOpNullDeviceSet) { TEST(DeviceUtilTest, GetDevicesFromOpNoDevicesAttribute) { mlir::MLIRContext context; + context.loadAllGloballyRegisteredDialects(); mlir::OwningModuleRef module_ref = mlir::ModuleOp::create(mlir::UnknownLoc::get(&context)); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_graph.cc b/tensorflow/compiler/mlir/tensorflow/utils/dump_graph.cc index c77107c8de7..4fcf036b160 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/dump_graph.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_graph.cc @@ -66,6 +66,7 @@ Status DumpTextualIRToFile(const MlirDumpConfig& config, const Graph& graph, WritableFile* file) { WritableFileRawStream os(std::move(file)); mlir::MLIRContext context; + context.loadAllGloballyRegisteredDialects(); mlir::OwningModuleRef module; if (flib_def) { flib_def = &graph.flib_def(); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util_test.cc index c0d109f7569..dee499605e1 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util_test.cc @@ -28,6 +28,7 @@ namespace { TEST(DumpMlirModuleTest, NoEnvPrefix) { mlir::MLIRContext context; + context.loadAllGloballyRegisteredDialects(); mlir::OwningModuleRef module_ref = mlir::ModuleOp::create(mlir::UnknownLoc::get(&context)); unsetenv("TF_DUMP_GRAPH_PREFIX"); @@ -38,6 +39,7 @@ TEST(DumpMlirModuleTest, NoEnvPrefix) { TEST(DumpMlirModuleTest, LogInfo) { mlir::MLIRContext context; + context.loadAllGloballyRegisteredDialects(); mlir::OwningModuleRef module_ref = mlir::ModuleOp::create(mlir::UnknownLoc::get(&context)); setenv("TF_DUMP_GRAPH_PREFIX", "-", 1); @@ -48,6 +50,7 @@ TEST(DumpMlirModuleTest, LogInfo) { TEST(DumpMlirModuleTest, Valid) { mlir::MLIRContext context; + context.loadAllGloballyRegisteredDialects(); mlir::OwningModuleRef module_ref = mlir::ModuleOp::create(mlir::UnknownLoc::get(&context)); setenv("TF_DUMP_GRAPH_PREFIX", testing::TmpDir().c_str(), 1); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/error_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/error_util_test.cc index b174ad40a3b..832bc04fdaa 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/error_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/error_util_test.cc @@ -29,6 +29,7 @@ using testing::HasSubstr; TEST(ErrorUtilTest, StatusScopedDiagnosticHandler) { MLIRContext context; + context.loadAllGloballyRegisteredDialects(); auto id = Identifier::get("test.cc", &context); auto loc = FileLineColLoc::get(id, 0, 0, &context); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc index ad9ddb277d7..67c2aebf121 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc @@ -22,6 +22,7 @@ limitations under the License. 
#include "absl/strings/str_split.h" #include "absl/strings/string_view.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/Casting.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project @@ -368,65 +369,36 @@ Status ConvertAttributes( name = mangling_util::DemangleAttributeName(name); } AttrValue value; - switch (attr.getKind()) { - case mlir::StandardAttributes::SymbolRef: { - TF_RETURN_IF_ERROR( - ConvertAttribute(attr.cast(), &value)); - func_call_attrs[string(name)] = value; - continue; - } - case mlir::StandardAttributes::Integer: - if (auto boolAttr = attr.dyn_cast()) { - TF_RETURN_IF_ERROR(ConvertAttribute(boolAttr, &value)); - } else { - TF_RETURN_IF_ERROR( - ConvertAttribute(attr.cast(), &value)); - } - break; - case mlir::StandardAttributes::Float: - TF_RETURN_IF_ERROR( - ConvertAttribute(attr.cast(), &value)); - break; - case mlir::StandardAttributes::String: - TF_RETURN_IF_ERROR( - ConvertAttribute(attr.cast(), &value)); - break; - case mlir::StandardAttributes::Array: - TF_RETURN_IF_ERROR( - ConvertAttribute(attr.cast(), &value)); - break; - case mlir::StandardAttributes::DenseIntOrFPElements: - case mlir::StandardAttributes::DenseStringElements: - case mlir::StandardAttributes::OpaqueElements: - TF_RETURN_IF_ERROR( - ConvertAttribute(attr.cast(), &value)); - break; - case mlir::StandardAttributes::Type: - TF_RETURN_IF_ERROR( - ConvertAttribute(attr.cast(), &value)); - break; - case mlir::StandardAttributes::Unit: - TF_RETURN_IF_ERROR( - ConvertAttribute(attr.cast(), &value)); - break; - case static_cast(mlir::TF::AttrKind::SHAPE): - TF_RETURN_IF_ERROR( - ConvertAttribute(attr.cast(), &value)); - break; - case static_cast(mlir::TF::AttrKind::FUNC): { - TF_RETURN_IF_ERROR( - ConvertAttribute(attr.cast(), &value)); - func_call_attrs[string(name)] = value; - continue; - } - // AffineMap kind is not implemented. - case mlir::StandardAttributes::AffineMap: - return errors::Unimplemented("AffineMap attribute (needed for '", - name_strref, "') unimplemented"); - default: - return errors::Unimplemented("Unhandled attribute kind for attribute '", - name_strref, '\''); + if (auto symbol_ref = attr.dyn_cast()) { + TF_RETURN_IF_ERROR( + ConvertAttribute(symbol_ref.cast(), &value)); + func_call_attrs[string(name)] = value; + continue; } + if (auto func_attr = attr.dyn_cast()) { + TF_RETURN_IF_ERROR(ConvertAttribute(func_attr, &value)); + func_call_attrs[string(name)] = value; + continue; + } + if (attr.isa()) { + // AffineMapAttr is not implemented. + return errors::Unimplemented("AffineMap attribute (needed for '", + name_strref, "') unimplemented"); + } + TF_RETURN_IF_ERROR( + llvm::TypeSwitch(attr) + .Case( + [&](auto derived_attr) { + return ConvertAttribute(derived_attr, &value); + }) + .Default([&](mlir::Attribute) { + return errors::Unimplemented( + "Unhandled attribute kind for attribute '", name_strref, + '\''); + })); + // According to the NodeDef proto definition, an attribute name from the // input TensorFlow GraphDef shouldn't contain '.'. 
If it does appear in // the attribute from MLIR, it is treated as an attribute from function diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc index b23fbe7d73c..fc206ca08f9 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc @@ -602,6 +602,7 @@ TEST(TPURewriteDeviceUtilTest, ValidGeneralDeviceAssignmentMesh1x2x1x3) { TEST(TPURewriteDeviceUtilTest, TestGetDeviceCoordinates) { mlir::MLIRContext context; + context.loadAllGloballyRegisteredDialects(); mlir::Builder builder(&context); auto device_assignment_attr = builder.getI64ArrayAttr({1, 2, 3}); auto status_or_device_coodinates = @@ -615,6 +616,7 @@ TEST(TPURewriteDeviceUtilTest, TestGetDeviceCoordinates) { TEST(TPURewriteDeviceUtilTest, TestInvalidAttrForDeviceAssignmentDisallowed) { mlir::MLIRContext context; + context.loadAllGloballyRegisteredDialects(); mlir::Builder builder(&context); auto device_assignment_attr = builder.getF32ArrayAttr({1.0, 2.0, 3.0}); auto status_or_device_coodinates = @@ -627,6 +629,7 @@ TEST(TPURewriteDeviceUtilTest, TestInvalidAttrForDeviceAssignmentDisallowed) { TEST(TPURewriteDeviceUtilTest, TestGetHostFailDeviceMissingAttributes) { mlir::registerDialect(); mlir::MLIRContext context; + context.loadAllGloballyRegisteredDialects(); mlir::OwningModuleRef module_ref = mlir::ModuleOp::create(mlir::UnknownLoc::get(&context)); mlir::OpBuilder builder(module_ref->getBodyRegion()); diff --git a/tensorflow/compiler/mlir/tf_mlir_opt_main.cc b/tensorflow/compiler/mlir/tf_mlir_opt_main.cc index 1416ac038d6..144e22750ca 100644 --- a/tensorflow/compiler/mlir/tf_mlir_opt_main.cc +++ b/tensorflow/compiler/mlir/tf_mlir_opt_main.cc @@ -17,77 +17,36 @@ limitations under the License. 
#include "llvm/Support/InitLLVM.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/ToolOutputFile.h" +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project #include "mlir/IR/AsmState.h" // from @llvm-project +#include "mlir/InitAllDialects.h" // from @llvm-project +#include "mlir/InitAllPasses.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project #include "mlir/Support/FileUtilities.h" // from @llvm-project #include "mlir/Support/MlirOptMain.h" // from @llvm-project #include "tensorflow/compiler/mlir/init_mlir.h" +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" #include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/platform/logging.h" -// NOLINTNEXTLINE -static llvm::cl::opt input_filename(llvm::cl::Positional, - llvm::cl::desc(""), - llvm::cl::init("-")); - -// NOLINTNEXTLINE -static llvm::cl::opt output_filename( - "o", llvm::cl::desc("Output filename"), llvm::cl::value_desc("filename"), - llvm::cl::init("-")); - -// NOLINTNEXTLINE -static llvm::cl::opt split_input_file( - "split-input-file", - llvm::cl::desc("Split the input file into pieces and process each " - "chunk independently"), - llvm::cl::init(false)); - -// NOLINTNEXTLINE -static llvm::cl::opt verify_diagnostics( - "verify-diagnostics", - llvm::cl::desc("Check that emitted diagnostics match " - "expected-* lines on the corresponding line"), - llvm::cl::init(false)); - -// NOLINTNEXTLINE -static llvm::cl::opt verify_passes( - "verify-each", - llvm::cl::desc("Run the verifier after each transformation pass"), - llvm::cl::init(true)); - -// NOLINTNEXTLINE -static llvm::cl::opt allowUnregisteredDialects( - "allow-unregistered-dialect", - llvm::cl::desc("Allow operation with no registered dialects"), - llvm::cl::init(false)); - int main(int argc, char **argv) { tensorflow::InitMlir y(&argc, &argv); - // Register various MLIR command line options. - mlir::registerAsmPrinterCLOptions(); - mlir::registerMLIRContextCLOptions(); - mlir::registerPassManagerCLOptions(); + mlir::registerAllPasses(); - // Parse pass names in main to ensure static initialization completed. - mlir::PassPipelineCLParser pass_pipeline("", "Compiler passes to run"); - - llvm::cl::ParseCommandLineOptions(argc, argv, - "TF MLIR modular optimizer driver\n"); - - // Set up the input file. 
- std::string error_message; - auto file = mlir::openInputFile(input_filename, &error_message); - QCHECK(file) << error_message; - - auto output = mlir::openOutputFile(output_filename, &error_message); - QCHECK(output) << error_message; - - if (failed(mlir::MlirOptMain(output->os(), std::move(file), pass_pipeline, - split_input_file, verify_diagnostics, - verify_passes, allowUnregisteredDialects))) - return 1; - output->keep(); - return 0; + mlir::DialectRegistry registry; + mlir::registerAllDialects(registry); + registry.insert(); + registry.insert(); + registry.insert(); + registry.insert(); + registry.insert(); + registry.insert(); + return failed( + mlir::MlirOptMain(argc, argv, "TensorFlow pass driver\n", registry)); } diff --git a/tensorflow/compiler/mlir/tf_mlir_translate_main.cc b/tensorflow/compiler/mlir/tf_mlir_translate_main.cc index caac8ea1eeb..9b0b3aaa82b 100644 --- a/tensorflow/compiler/mlir/tf_mlir_translate_main.cc +++ b/tensorflow/compiler/mlir/tf_mlir_translate_main.cc @@ -111,6 +111,7 @@ int main(int argc, char** argv) { if (import_saved_model_object_graph) { mlir::MLIRContext context; + context.loadAllGloballyRegisteredDialects(); auto module_or = tensorflow::SavedModelObjectGraphToMlirImport( input_filename, tags, exported_names, &context); @@ -119,6 +120,7 @@ int main(int argc, char** argv) { module_or.ConsumeValueOrDie()->print(output->os()); } else if (import_saved_model_signature_defs) { mlir::MLIRContext context; + context.loadAllGloballyRegisteredDialects(); auto module_or = tensorflow::SavedModelSignatureDefsToMlirImport( input_filename, tags, exported_names, &context, upgrade_legacy); @@ -139,6 +141,7 @@ int main(int argc, char** argv) { llvm::SourceMgr sourceMgr; sourceMgr.AddNewSourceBuffer(std::move(ownedBuffer), llvm::SMLoc()); mlir::MLIRContext context; + context.loadAllGloballyRegisteredDialects(); mlir::SourceMgrDiagnosticHandler diagnostic_handler(sourceMgr, &context); return (*requested_translation)(sourceMgr, os, &context); }; diff --git a/tensorflow/compiler/mlir/tfjs/translate/tf_tfjs_translate.cc b/tensorflow/compiler/mlir/tfjs/translate/tf_tfjs_translate.cc index e735a3c7b8c..915fb91a8df 100644 --- a/tensorflow/compiler/mlir/tfjs/translate/tf_tfjs_translate.cc +++ b/tensorflow/compiler/mlir/tfjs/translate/tf_tfjs_translate.cc @@ -125,6 +125,7 @@ int main(int argc, char** argv) { "TF GraphDef to TFJS JSON converter\n"); MLIRContext context; + context.loadAllGloballyRegisteredDialects(); llvm::SourceMgr source_mgr; mlir::SourceMgrDiagnosticHandler sourceMgrHandler(source_mgr, &context); diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD index 5befdcdc513..e01c059ad90 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD +++ b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD @@ -72,7 +72,6 @@ tf_cc_binary( "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", "@llvm-project//mlir:IR", "@llvm-project//mlir:MlirOptLib", - "@llvm-project//mlir:MlirOptMain", "@llvm-project//mlir:Pass", "@llvm-project//mlir:Support", ], diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc b/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc index 82b0e613f90..5f358c61cc2 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc @@ -261,6 +261,7 @@ StatusOr> tensorflow::kernel_gen::GenerateCubinForTfCode( llvm::ArrayRef unroll_factors) { RegisterDialects(); mlir::MLIRContext context; + 
context.loadAllGloballyRegisteredDialects(); mlir::OwningModuleRef module = mlir::parseSourceString(tf_code, &context); TF_RETURN_IF_ERROR(LowerTfOpToLhloWithDynamicShapes(module.get())); diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc index 5b7a19a3eac..8c02a734f1d 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc @@ -48,13 +48,11 @@ Type TFFrameworkDialect::parseType(DialectAsmParser &parser) const { /// Print a type registered to this dialect. void TFFrameworkDialect::printType(Type type, DialectAsmPrinter &os) const { - switch (type.getKind()) { - case TFFrameworkTypes::OpKernelContextType: - os << "op_kernel_context"; - return; - default: - llvm_unreachable("unexpected TF Framework type kind"); + if (type.isa()) { + os << "op_kernel_context"; + return; } + llvm_unreachable("unexpected TF Framework type kind"); } template diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h index a4c588a41f5..d2612a38799 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h +++ b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h @@ -30,22 +30,12 @@ namespace mlir { namespace kernel_gen { namespace tf_framework { -namespace TFFrameworkTypes { -enum Kind { - OpKernelContextType = Type::FIRST_TF_FRAMEWORK_TYPE, -}; -} // namespace TFFrameworkTypes - /// OpKernelContextType corresponds to C++ class OpKernelContext defined in /// tensorflow/core/framework/op_kernel.h class OpKernelContextType : public Type::TypeBase { public: using Base::Base; - - static OpKernelContextType get(MLIRContext *context) { - return Base::get(context, TFFrameworkTypes::Kind::OpKernelContextType); - } }; #define GET_OP_CLASSES diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tools/kernel-gen-opt/kernel-gen-opt.cc b/tensorflow/compiler/mlir/tools/kernel_gen/tools/kernel-gen-opt/kernel-gen-opt.cc index c1af35617b1..4fb169a9729 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tools/kernel-gen-opt/kernel-gen-opt.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tools/kernel-gen-opt/kernel-gen-opt.cc @@ -90,8 +90,9 @@ int main(int argc, char **argv) { if (showDialects) { mlir::MLIRContext context; + context.loadAllGloballyRegisteredDialects(); llvm::outs() << "Registered Dialects:\n"; - for (mlir::Dialect *dialect : context.getRegisteredDialects()) { + for (mlir::Dialect *dialect : context.getLoadedDialects()) { llvm::outs() << dialect->getNamespace() << "\n"; } return 0; @@ -111,9 +112,12 @@ int main(int argc, char **argv) { exit(1); } - if (failed(MlirOptMain(output->os(), std::move(file), passPipeline, + mlir::DialectRegistry registry; + registerAllDialects(registry); + if (failed(MlirOptMain(output->os(), std::move(file), passPipeline, registry, splitInputFile, verifyDiagnostics, verifyPasses, - allowUnregisteredDialects))) { + allowUnregisteredDialects, + /*preloadDialectsInContext=*/true))) { return 1; } // Keep the output file if the invocation of MlirOptMain was successful. 
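The tf-mlir-opt and kernel-gen-opt drivers above now pass a DialectRegistry to MlirOptMain instead of relying on global registration and hand-rolled command-line parsing. A minimal sketch of that driver shape, assuming the usual TensorFlow dialect class for the insert<> call (the concrete dialect list in the patch is not visible here because the angle-bracketed arguments were dropped):

// Illustrative sketch only, not part of the patch.
#include "mlir/IR/Dialect.h"
#include "mlir/InitAllDialects.h"
#include "mlir/InitAllPasses.h"
#include "mlir/Support/LogicalResult.h"
#include "mlir/Support/MlirOptMain.h"
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"

int main(int argc, char** argv) {
  mlir::registerAllPasses();

  mlir::DialectRegistry registry;
  mlir::registerAllDialects(registry);             // core MLIR dialects
  registry.insert<mlir::TF::TensorFlowDialect>();  // plus the project dialects

  // MlirOptMain parses the command line, attaches the registry to the
  // context, and runs the requested pass pipeline.
  return mlir::failed(
      mlir::MlirOptMain(argc, argv, "Example pass driver\n", registry));
}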
diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD index 71e18af498b..4c14bcf8960 100644 --- a/tensorflow/compiler/mlir/xla/BUILD +++ b/tensorflow/compiler/mlir/xla/BUILD @@ -55,6 +55,7 @@ cc_library( "transforms/passes.h", ], deps = [ + ":attribute_importer", ":type_to_shape", ":xla_legalize_tf_with_tf2xla", "//tensorflow/compiler/mlir/hlo", @@ -69,7 +70,7 @@ cc_library( "//tensorflow/compiler/xla/client/lib:conv_grad_size_util", "//tensorflow/core:framework", "//tensorflow/core/kernels:conv_grad_shape_utils", - "//tensorflow/core/lib/bfloat16", + "//tensorflow/core/platform:bfloat16", "@llvm-project//llvm:Support", "@llvm-project//mlir:Analysis", "@llvm-project//mlir:Dialect", @@ -95,6 +96,7 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:convert_type", "//tensorflow/compiler/mlir/tensorflow:export_tf_dialect_op", "//tensorflow/compiler/mlir/tensorflow:lower_tf_lib", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", "//tensorflow/compiler/mlir/tensorflow:translate_utils", "//tensorflow/compiler/tf2xla:xla_compilation_device", "//tensorflow/compiler/tf2xla:xla_context", diff --git a/tensorflow/compiler/mlir/xla/hlo_function_importer.h b/tensorflow/compiler/mlir/xla/hlo_function_importer.h index db981bb0227..e0cc89004cf 100644 --- a/tensorflow/compiler/mlir/xla/hlo_function_importer.h +++ b/tensorflow/compiler/mlir/xla/hlo_function_importer.h @@ -19,6 +19,7 @@ limitations under the License. #include #include "absl/types/optional.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Function.h" // from @llvm-project @@ -62,7 +63,10 @@ class HloFunctionImporter { : context_(module.getContext()), module_(module), builder_(builder), - function_map_(function_map) {} + function_map_(function_map) { + context_->loadDialect(); + context_->loadDialect(); + } // Imports the given computation as a new function, if it hasn't been already // imported. diff --git a/tensorflow/compiler/mlir/xla/hlo_module_importer.cc b/tensorflow/compiler/mlir/xla/hlo_module_importer.cc index dd045da3899..9db5861934f 100644 --- a/tensorflow/compiler/mlir/xla/hlo_module_importer.cc +++ b/tensorflow/compiler/mlir/xla/hlo_module_importer.cc @@ -30,6 +30,12 @@ limitations under the License. namespace xla { +HloModuleImporter::HloModuleImporter(mlir::ModuleOp module) + : module_(module), builder_(module.getContext()) { + module.getContext()->loadDialect(); + module.getContext()->loadDialect(); +} + Status HloModuleImporter::Import(const xla::HloModule& module) { // TODO(hinsu): Only import the entry computation here once all HLO ops with // reference to other computation are updated to have a region instead of a diff --git a/tensorflow/compiler/mlir/xla/hlo_module_importer.h b/tensorflow/compiler/mlir/xla/hlo_module_importer.h index 69ac1e28219..401299484ed 100644 --- a/tensorflow/compiler/mlir/xla/hlo_module_importer.h +++ b/tensorflow/compiler/mlir/xla/hlo_module_importer.h @@ -38,8 +38,7 @@ class Shape; // dialect. HloModuleImporter does not take ownership. class HloModuleImporter { public: - explicit HloModuleImporter(mlir::ModuleOp module) - : module_(module), builder_(module.getContext()) {} + explicit HloModuleImporter(mlir::ModuleOp module); // Import the HloModule into the MLIR Module. 
Status Import(const xla::HloModule& module); diff --git a/tensorflow/compiler/mlir/xla/hlo_utils.cc b/tensorflow/compiler/mlir/xla/hlo_utils.cc index cf78c81908d..b9d563a659d 100644 --- a/tensorflow/compiler/mlir/xla/hlo_utils.cc +++ b/tensorflow/compiler/mlir/xla/hlo_utils.cc @@ -22,7 +22,7 @@ limitations under the License. #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "tensorflow/compiler/xla/literal.h" -#include "tensorflow/core/lib/bfloat16/bfloat16.h" +#include "tensorflow/core/platform/bfloat16.h" #include "tensorflow/core/platform/logging.h" namespace xla { @@ -83,6 +83,9 @@ StatusOr> GetPermutationIfAvailable( strides[dim] = accumulated_stride; accumulated_stride *= shape.dimensions(dim); } + if (accumulated_stride == 0) { + return llvm::SmallVector{}; + } return llvm::SmallVector{ makeStridedLinearLayoutMap(strides, /*offset=*/0, builder.getContext())}; } diff --git a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc index c94110d9102..ac5e01a0abf 100644 --- a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc +++ b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc @@ -312,6 +312,16 @@ StatusOr MlirHloBuilder::RngOpInternal( return CreateOp(op_name, shape, operands); } +StatusOr MlirHloBuilder::RngBitGeneratorInternal( + const Shape& full_result_shape, RandomAlgorithm algorithm, + XlaOp initial_state) { + TF_ASSIGN_OR_RETURN(mlir::Type ty, ConvertShapeToType( + full_result_shape, builder_)); + auto op = builder_.create( + loc_, ty, builder_.getI32IntegerAttr(algorithm), GetValue(initial_state)); + return MakeXlaOp(op); +} + StatusOr MlirHloBuilder::ReshapeInternal(const Shape& shape, XlaOp operand, int64 inferred_dimension) { @@ -351,6 +361,13 @@ StatusOr MlirHloBuilder::InDimBroadcast( return MakeXlaOp(op.getResult()); } +StatusOr MlirHloBuilder::AddInstruction( + HloInstructionProto&& instr, HloOpcode opcode, + absl::Span operands) { + return Unimplemented("MlirHloBuilder does not support op %s", + HloOpcodeString(opcode)); +} + StatusOr MlirHloBuilder::Compare(const Shape& shape, XlaOp lhs, XlaOp rhs, ComparisonDirection direction) { @@ -382,6 +399,31 @@ XlaOp MlirHloBuilder::CreateToken() { }); } +StatusOr MlirHloBuilder::TriangularSolveInternal( + const Shape& shape, XlaOp a, XlaOp b, TriangularSolveOptions options) { + TF_ASSIGN_OR_RETURN( + mlir::Type result_ty, + ConvertShapeToType(shape, builder_)); + auto op = builder_.create( + loc_, result_ty, GetValue(a), GetValue(b), + builder_.getBoolAttr(options.left_side()), + builder_.getBoolAttr(options.lower()), + builder_.getBoolAttr(options.unit_diagonal()), + builder_.getStringAttr( + TriangularSolveOptions::Transpose_Name(options.transpose_a()))); + return MakeXlaOp(op); +} + +StatusOr MlirHloBuilder::CholeskyInternal(const Shape& shape, XlaOp a, + bool lower) { + TF_ASSIGN_OR_RETURN( + mlir::Type result_ty, + ConvertShapeToType(shape, builder_)); + auto op = builder_.create( + loc_, result_ty, GetValue(a), builder_.getBoolAttr(lower)); + return MakeXlaOp(op); +} + StatusOr MlirHloBuilder::InfeedWithTokenInternal( const Shape& infeed_instruction_shape, XlaOp token, const string& config) { TF_ASSIGN_OR_RETURN(mlir::Type result_type, diff --git a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h index a12eb723465..00b7aa4d0b0 100644 --- a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h +++ 
b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h @@ -124,6 +124,13 @@ class MlirHloBuilder : public XlaBuilder { FftType fft_type, absl::Span fft_length) override; + StatusOr TriangularSolveInternal( + const Shape& shape, XlaOp a, XlaOp b, + TriangularSolveOptions options) override; + + StatusOr CholeskyInternal(const Shape& shape, XlaOp a, + bool lower) override; + StatusOr CustomCallInternal( const string& call_target_name, absl::Span operands, const Shape& shape, const string& opaque, @@ -176,6 +183,9 @@ class MlirHloBuilder : public XlaBuilder { StatusOr RngOpInternal(RandomDistribution distribution, absl::Span parameters, const Shape& shape) override; + StatusOr RngBitGeneratorInternal(const Shape& full_result_shape, + RandomAlgorithm algorithm, + XlaOp initial_state) override; StatusOr ReshapeInternal(const Shape& shape, XlaOp operand, int64 inferred_dimension) override; @@ -189,6 +199,9 @@ class MlirHloBuilder : public XlaBuilder { const Shape& shape, XlaOp operand, absl::Span broadcast_dimensions) override; + StatusOr AddInstruction(HloInstructionProto&& instr, HloOpcode opcode, + absl::Span operands) override; + StatusOr Compare(const Shape& shape, XlaOp lhs, XlaOp rhs, ComparisonDirection direction) override; diff --git a/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/non_identity_layouts.hlotxt b/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/non_identity_layouts.hlotxt index 3630d2d45e4..a83e36cff64 100644 --- a/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/non_identity_layouts.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/non_identity_layouts.hlotxt @@ -8,6 +8,6 @@ HloModule TestModule ENTRY TestComputation { x = f32[3, 2]{1,0} parameter(0) - // CHECK: "lmhlo.copy"(%{{.*}}, %{{.*}}) : (memref<3x2xf32>, memref<3x2xf32, #[[MAP]]>) -> () + // CHECK: "lmhlo.copy"(%{{.*}}, %{{.*}}) {name = "copy.1"} : (memref<3x2xf32>, memref<3x2xf32, #[[MAP]]>) -> () ROOT x.copy = f32[3, 2]{0,1} copy(x) } diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-BatchMatMulV2.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-BatchMatMulV2.mlir index 69eaeeb946d..cffb15022b0 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-BatchMatMulV2.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-BatchMatMulV2.mlir @@ -17,9 +17,7 @@ func @batchmatmulv2_basic(%arg0: tensor<1x4x2xf32>, %arg1: tensor<3x2x4xf32>) -> // CHECK: [[LHSSHAPEEXTENTS:%.*]] = shape.to_extent_tensor [[LHSBCASTSHAPE]] // CHECK: [[LHSBCAST:%.*]] = "mhlo.dynamic_broadcast_in_dim"([[LHS]], [[LHSSHAPEEXTENTS]]) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x4x2xf32>, tensor<3xindex>) -> tensor<3x4x2xf32> // CHECK: [[RHSBCASTSHAPE:%.*]] = shape.concat [[BCASTHEAD]], [[RHSTAIL]] -// CHECK: [[RHSSHAPEEXTENTS:%.*]] = shape.to_extent_tensor [[RHSBCASTSHAPE]] -// CHECK: [[RHSBCAST:%.*]] = "mhlo.dynamic_broadcast_in_dim"([[RHS]], [[RHSSHAPEEXTENTS]]) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<3x2x4xf32>, tensor<3xindex>) -> tensor<3x2x4xf32> -// CHECK: [[RESULT:%.*]] = "mhlo.dot_general"([[LHSBCAST]], [[RHSBCAST]]) {dot_dimension_numbers = {lhs_batching_dimensions = dense<0> : tensor<1xi64>, lhs_contracting_dimensions = dense<2> : tensor<1xi64>, rhs_batching_dimensions = dense<0> : tensor<1xi64>, rhs_contracting_dimensions = dense<1> : tensor<1xi64>}} : (tensor<3x4x2xf32>, tensor<3x2x4xf32>) -> tensor<3x4x4xf32> +// CHECK: [[RESULT:%.*]] = "mhlo.dot_general"([[LHSBCAST]], [[RHS]]) {dot_dimension_numbers = 
{lhs_batching_dimensions = dense<0> : tensor<1xi64>, lhs_contracting_dimensions = dense<2> : tensor<1xi64>, rhs_batching_dimensions = dense<0> : tensor<1xi64>, rhs_contracting_dimensions = dense<1> : tensor<1xi64>}} : (tensor<3x4x2xf32>, tensor<3x2x4xf32>) -> tensor<3x4x4xf32> // CHECK: return [[RESULT]] : tensor<3x4x4xf32> // CHECK: } @@ -29,7 +27,6 @@ func @batchmatmulv2_basic(%arg0: tensor<1x4x2xf32>, %arg1: tensor<3x2x4xf32>) -> func @batchmatmulv2_lhs_batch(%arg0: tensor<3x4x2xf32>, %arg1: tensor<2x4xf32>) -> tensor<3x4x4xf32> { // CHECK-LABEL: func @batchmatmulv2_lhs_batch -// CHECK: "mhlo.dynamic_broadcast_in_dim"({{.*}}, {{.*}}) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} // CHECK: "mhlo.dynamic_broadcast_in_dim"({{.*}}, {{.*}}) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} // CHECK: "mhlo.dot_general"({{.*}}, {{.*}}) {dot_dimension_numbers = { // CHECK-SAME: lhs_batching_dimensions = dense<0> : tensor<1xi64>, @@ -43,7 +40,6 @@ func @batchmatmulv2_lhs_batch(%arg0: tensor<3x4x2xf32>, %arg1: tensor<2x4xf32>) func @batchmatmulv2_rhs_batch(%arg0: tensor<4x2xf32>, %arg1: tensor<3x2x4xf32>) -> tensor<3x4x4xf32> { // CHECK-LABEL: func @batchmatmulv2_rhs_batch // CHECK: "mhlo.dynamic_broadcast_in_dim"({{.*}}, {{.*}}) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} -// CHECK: "mhlo.dynamic_broadcast_in_dim"({{.*}}, {{.*}}) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} // CHECK: "mhlo.dot_general"({{.*}}, {{.*}}) {dot_dimension_numbers = { // CHECK-SAME: lhs_batching_dimensions = dense<0> : tensor<1xi64>, // CHECK-SAME: lhs_contracting_dimensions = dense<2> : tensor<1xi64>, diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-communication.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-communication.mlir index 550b2ba4da3..876a1bf03e7 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-communication.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-communication.mlir @@ -169,7 +169,7 @@ func @send_to_host(%arg0: tensor) { // CHECK: "mhlo.send"([[ARG0]], [[INIT_TOKEN]]) // CHECK-SAME: channel_id = {handle = 1 : i64, type = 2 : i64} // CHECK-SAME: is_host_transfer = true - // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "s32", _xla_host_transfer_rendezvous = "send_key"} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "s32", _xla_host_transfer_rendezvous = "send_key_dtoh_0"} // CHECK-SAME: (tensor, !mhlo.token) -> !mhlo.token "tf.XlaSendToHost"(%arg0) {key = "send_key"} : (tensor) -> () return @@ -186,7 +186,7 @@ func @recv_from_host() -> tensor { // CHECK: [[RECV_TUPLE:%.*]] = "mhlo.recv"([[INIT_TOKEN]]) // CHECK-SAME: channel_id = {handle = 1 : i64, type = 3 : i64} // CHECK-SAME: is_host_transfer = true - // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "s32", _xla_host_transfer_rendezvous = "recv_key"} + // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_original_type = "s32", _xla_host_transfer_rendezvous = "recv_key_htod_0"} // CHECK-SAME: (!mhlo.token) -> tuple, !mhlo.token> diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-control-flow.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-control-flow.mlir index 5a9089756a9..93eac3821b2 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-control-flow.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-control-flow.mlir @@ -44,7 +44,7 @@ attributes {tf._input_shapes = ["tfshape$", "tfshape$"]} { // CHECK-LABEL: func @case // CHECK-SAME: 
%[[BRANCH_INDEX:.*]]: tensor, %[[ARG0:.*]]: tensor, %[[ARG1:.*]]: tensor) -> (tensor, tensor) func @case(%index: tensor, %arg0: tensor, %arg1: tensor) -> (tensor, tensor) { - %0:2 = "tf.Case"(%index, %arg0, %arg1) {branches = [@exponential, @log, @floor]} : (tensor, tensor, tensor) -> (tensor, tensor) + %0:2 = "tf.Case"(%index, %arg0, %arg1) {branches = [@exponential, @log, @floor], is_stateless = true} : (tensor, tensor, tensor) -> (tensor, tensor) // CHECK: %[[TUPLE_INPUT:.*]] = "mhlo.tuple"(%[[ARG0]], %[[ARG1]]) : (tensor, tensor) -> tuple, tensor> // CHECK: %[[CASE:.*]]:2 = "mhlo.case"(%[[BRANCH_INDEX]], %[[TUPLE_INPUT]], %[[TUPLE_INPUT]], %[[TUPLE_INPUT]]) ( { // CHECK: ^bb0(%[[TUPLE_ARG:.*]]: tuple, tensor>): diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir index cd351447303..df4f0303a84 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir @@ -265,6 +265,47 @@ func @non_max_suppression_v4(%arg0: tensor<3x4xf32>, %arg1: tensor<3xf32>, %arg2 return %0#0 : tensor<2xi32> } +// CHECK-LABEL: bessel_i0e +func @bessel_i0e(%arg0: tensor<3xf16>, %arg1: tensor<3xf32>, %arg2: tensor<3xf64>) -> (tensor<3xf16>, tensor<3xf32>, tensor<3xf64>) { + // CHECK-NOT: tf.BesselI0e + %0 = "tf.BesselI0e"(%arg0) : (tensor<3xf16>) -> (tensor<3xf16>) + %1 = "tf.BesselI0e"(%arg1) : (tensor<3xf32>) -> (tensor<3xf32>) + %2 = "tf.BesselI0e"(%arg2) : (tensor<3xf64>) -> (tensor<3xf64>) + return %0, %1, %2 : tensor<3xf16>, tensor<3xf32>, tensor<3xf64> +} + +// CHECK-LABEL: bessel_i1e +func @bessel_i1e(%arg0: tensor<3xf16>, %arg1: tensor<3xf32>, %arg2: tensor<3xf64>) -> (tensor<3xf16>, tensor<3xf32>, tensor<3xf64>) { + // CHECK-NOT: tf.BesselI1e + %0 = "tf.BesselI1e"(%arg0) : (tensor<3xf16>) -> (tensor<3xf16>) + %1 = "tf.BesselI1e"(%arg1) : (tensor<3xf32>) -> (tensor<3xf32>) + %2 = "tf.BesselI1e"(%arg2) : (tensor<3xf64>) -> (tensor<3xf64>) + return %0, %1, %2 : tensor<3xf16>, tensor<3xf32>, tensor<3xf64> +} + +// CHECK-LABEL: diag +func @diag(%arg0: tensor<2xf32>) -> tensor<2x2xf32> { + // CHECK-NOT: tf.Diag + %0 = "tf.Diag"(%arg0) : (tensor<2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> +} + +// CHECK-LABEL: random_uniform_int +func @random_uniform_int(%arg0: tensor, %arg1: tensor) -> tensor<1000xi32> { + %0 = "tf.Const"() {value = dense<1000> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-NOT: tf.RandomUniformInt + %1 = "tf.RandomUniformInt"(%0, %arg0, %arg1) {seed = 0 : i64, seed2 = 0 : i64} : (tensor<1xi32>, tensor, tensor) -> tensor<1000xi32> + return %1 : tensor<1000xi32> +} + +// CHECK-LABEL: multinomial +func @multinomial(%arg0: tensor<2x4xf32>, %seed: tensor, %seed2: tensor) -> tensor<2x10xi32> { + // CHECK-NOT: tf.Multinomial + %samples = "tf.Const"() { value = dense<10> : tensor } : () -> tensor + %1 = "tf.Multinomial"(%arg0, %samples) {seed = 0, seed2 = 0}: (tensor<2x4xf32>, tensor) -> tensor<2x10xi32> + return %1 : tensor<2x10xi32> +} + // TODO(hinsu): Add a test with a valid TF op for which tf2xla kernel is // available but doesn't support this instance. 
} diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir index 9b32fb97260..56d4236c0a0 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir @@ -1499,6 +1499,35 @@ func @stateful_pcall_multi_in_out(%arg0: tensor, %arg1: tensor) -> (te return %arg1, %arg0 : tensor, tensor } +//===----------------------------------------------------------------------===// +// Elu op legalizations. +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: func @elu +func @elu(%arg0: tensor<1xf32>) -> tensor<1xf32> { + // CHECK-DAG: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor + // CHECK-DAG: %[[PRED:.*]] = chlo.broadcast_compare %arg0, %[[ZERO]] {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = "GT"} + // CHECK-DAG: %[[EXP:.*]] = "mhlo.exponential_minus_one"(%arg0) + // CHECK: %[[RESULT:.*]] = "mhlo.select"(%[[PRED]], %arg0, %[[EXP]]) + // CHECK: return %[[RESULT]] + %0 = "tf.Elu"(%arg0) : (tensor<1xf32>) -> tensor<1xf32> + return %0: tensor<1xf32> +} + +// CHECK-LABEL: func @elu_grad +// CHECK-SAME: (%[[GRADIENTS:.*]]: tensor<4x8xf32>, %[[FEATURES:.*]]: tensor) +func @elu_grad(%gradients: tensor<4x8xf32>, %features: tensor) -> tensor<4x8xf32> { + // CHECK-DAG: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor + // CHECK-DAG: %[[ONE:.*]] = mhlo.constant dense<1.000000e+00> : tensor + // CHECK-DAG: %[[PRED:.*]] = chlo.broadcast_compare %[[FEATURES]], %[[ZERO]] {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = "GT"} + // CHECK-DAG: %[[ADD1:.*]] = chlo.broadcast_add %[[FEATURES]], %[[ONE]] {broadcast_dimensions = dense<> : tensor<0xi64>} + // CHECK-DAG: %[[MULGRAD:.*]] = "mhlo.multiply"(%[[GRADIENTS]], %[[ADD1]]) + // CHECK: %[[RESULT:.*]] = "mhlo.select"(%[[PRED]], %[[GRADIENTS]], %[[MULGRAD]]) + // CHECK: return %[[RESULT]] + %2 = "tf.EluGrad"(%gradients, %features) : (tensor<4x8xf32>, tensor) -> tensor<4x8xf32> + return %2 : tensor<4x8xf32> +} + //===----------------------------------------------------------------------===// // Relu op legalizations. 
//===----------------------------------------------------------------------===// @@ -3484,6 +3513,20 @@ func @conv3d_backprop_filter(%input: tensor<2x8x8x8x1xf32>, %out_backprop: tenso return %result : tensor<2x8x8x8x1xf32> } +// CHECK-LABEL: @collective_permute +func @collective_permute(%arg0: tensor<128x32xf32>) -> tensor<128x32xf32> { + %source_target_pairs = "tf.Const" () { + value = dense<[[0, 1], [1, 2], [2, 3]]> : tensor<3x2xi32> + } : () -> tensor<3x2xi32> + + // CHECK: "mhlo.collective_permute" + // CHECK-SAME: source_target_pairs = dense<{{\[}}[0, 1], [1, 2], [2, 3]]> : tensor<3x2xi64> + %0 = "tf.CollectivePermute"(%arg0, %source_target_pairs) { + } : (tensor<128x32xf32>, tensor<3x2xi32>) -> tensor<128x32xf32> + + return %0 : tensor<128x32xf32> +} + // CHECK-LABEL: @cross_replica_sum func @cross_replica_sum(%input: tensor<10xf32>) -> tensor<10xf32> { %replica_groups = "tf.Const" () { @@ -3504,8 +3547,9 @@ func @cross_replica_sum(%input: tensor<10xf32>) -> tensor<10xf32> { func @size_scalar_i32(%input: tensor) -> (tensor) { // CHECK: %[[CONST:.*]] = mhlo.constant dense<1> // CHECK-SAME: tensor + // CHECK: %[[CAST:.*]] = tensor_cast %[[CONST]] : tensor to tensor %size = "tf.Size"(%input) {T = "tfdtype$DT_FLOAT", out_type = "tfdtype$DT_INT32"} : (tensor) -> tensor - // CHECK: return %[[CONST]] + // CHECK: return %[[CAST]] return %size : tensor } @@ -3513,8 +3557,9 @@ func @size_scalar_i32(%input: tensor) -> (tensor) { func @size_scalar_i64(%input: tensor) -> (tensor) { // CHECK: %[[CONST:.*]] = mhlo.constant dense<1> // CHECK-SAME: tensor + // CHECK: %[[CAST:.*]] = tensor_cast %[[CONST]] : tensor to tensor %size = "tf.Size"(%input) {T = "tfdtype$DT_FLOAT", out_type = "tfdtype$DT_INT64"} : (tensor) -> tensor - // CHECK: return %[[CONST]] + // CHECK: return %[[CAST]] return %size : tensor } @@ -3775,7 +3820,7 @@ func @unsorted_segment_prod(%data: tensor<8x?x64xf32>, %segment_ids : tensor, %segment_ids : tensor) -> (tensor<4x?xf32>) { %num_segments = "tf.Const"() {value = dense<4> : tensor} : () -> tensor - // CHECK: mhlo.constant dense<0x7F800000> : tensor + // CHECK: mhlo.constant dense<3.40282347E+38> : tensor // CHECK: mhlo.scatter // CHECK: mhlo.minimum %0 = "tf.UnsortedSegmentMin"(%data, %segment_ids, %num_segments) : (tensor<8x?x64xf32>, tensor, tensor) -> (tensor<4x?xf32>) @@ -3785,7 +3830,7 @@ func @unsorted_segment_min(%data: tensor<8x?x64xf32>, %segment_ids : tensor, %segment_ids : tensor) -> (tensor<4x?xf32>) { %num_segments = "tf.Const"() {value = dense<4> : tensor} : () -> tensor - // CHECK: mhlo.constant dense<0xFF800000> : tensor + // CHECK: mhlo.constant dense<-3.40282347E+38> : tensor // CHECK: mhlo.scatter // CHECK: mhlo.maximum %0 = "tf.UnsortedSegmentMax"(%data, %segment_ids, %num_segments) : (tensor<8x?x64xf32>, tensor, tensor) -> (tensor<4x?xf32>) @@ -4668,6 +4713,20 @@ func @cumsum_dynamic(%arg0: tensor, %arg1: tensor) -> tensor return %0 : tensor } +//===----------------------------------------------------------------------===// +// Cumprod op legalizations. 
+//===----------------------------------------------------------------------===// + +// CHECK-LABEL: func @cumprod +func @cumprod(%arg0: tensor<4xf32>) -> tensor<4xf32> { + // CHECK: [[INIT:%.*]] = mhlo.constant dense<1.000000e+00> : tensor + // CHECK: "mhlo.reduce_window"({{.*}}, [[INIT]]) ( { + // CHECK: mhlo.mul + %0 = "tf.Const"() {_output_shapes = ["tfshape$"], device = "", dtype = i32, value = dense<0> : tensor} : () -> tensor + %1 = "tf.Cumprod"(%arg0, %0) {exclusive = false, reverse = false} : (tensor<4xf32>, tensor) -> tensor<4xf32> + return %1 : tensor<4xf32> +} + //===----------------------------------------------------------------------===// // Qr op legalization //===----------------------------------------------------------------------===// @@ -4766,3 +4825,37 @@ func @softplus_f64(%arg0: tensor<8x16xf64>) -> tensor<8x16xf64> { // CHECK: return [[ENTRY_SELECT]] : tensor<8x16xf64> return %0 : tensor<8x16xf64> } + +// CHECK-LABEL: @xla_gather +func @xla_gather(%arg0: tensor<200x100x300xf32>, %arg1: tensor<10x2xi32>) -> tensor<10x1x300xf32> { + %cst = "tf.Const"() { value = dense<[1, 1, 300]> : tensor<3xi64> } : () -> tensor<3xi64> + + // CHECK: "mhlo.gather" + // CHECK-SAME: dimension_numbers = + // CHECK-SAME: collapsed_slice_dims = dense<0> : tensor<1xi64> + // CHECK-SAME: index_vector_dim = 1 : i64 + // CHECK-SAME: offset_dims = dense<1> : tensor<1xi64> + // CHECK-SAME: start_index_map = dense<0> : tensor<1xi64> + // CHECK-SAME: indices_are_sorted = true + // CHECK-SAME: slice_sizes = dense<[1, 1, 300]> : tensor<3xi64> + + %0 = "tf.XlaGather"(%arg0, %arg1, %cst) {dimension_numbers = "\0A\01\01\12\01\00\1A\01\00 \01", indices_are_sorted = true} : (tensor<200x100x300xf32>, tensor<10x2xi32>, tensor<3xi64>) -> tensor<10x1x300xf32> + return %0 : tensor<10x1x300xf32> +} + +// CHECK-LABEL: @xla_gather_i32 +func @xla_gather_i32(%arg0: tensor<200x100x300xf32>, %arg1: tensor<10x2xi32>) -> tensor<10x1x300xf32> { + %cst = "tf.Const"() { value = dense<[1, 1, 300]> : tensor<3xi32> } : () -> tensor<3xi32> + + // CHECK: "mhlo.gather" + // CHECK-SAME: dimension_numbers = + // CHECK-SAME: collapsed_slice_dims = dense<0> : tensor<1xi64> + // CHECK-SAME: index_vector_dim = 1 : i64 + // CHECK-SAME: offset_dims = dense<1> : tensor<1xi64> + // CHECK-SAME: start_index_map = dense<0> : tensor<1xi64> + // CHECK-SAME: indices_are_sorted = true + // CHECK-SAME: slice_sizes = dense<[1, 1, 300]> : tensor<3xi64> + + %0 = "tf.XlaGather"(%arg0, %arg1, %cst) {dimension_numbers = "\0A\01\01\12\01\00\1A\01\00 \01", indices_are_sorted = true} : (tensor<200x100x300xf32>, tensor<10x2xi32>, tensor<3xi32>) -> tensor<10x1x300xf32> + return %0 : tensor<10x1x300xf32> +} diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc index 5fe933ee635..3462b3b7a5a 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc @@ -50,6 +50,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/utils/hlo_utils.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.h" +#include "tensorflow/compiler/mlir/xla/attribute_importer.h" #include "tensorflow/compiler/mlir/xla/transforms/passes.h" #include "tensorflow/compiler/xla/client/lib/conv_grad_size_util.h" #include "tensorflow/compiler/xla/client/padding.h" @@ -57,7 +58,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/kernel_shape_util.h" #include "tensorflow/core/kernels/conv_grad_shape_utils.h" -#include "tensorflow/core/lib/bfloat16/bfloat16.h" +#include "tensorflow/core/platform/bfloat16.h" #include "tensorflow/core/util/padding.h" #include "tensorflow/core/util/tensor_format.h" @@ -262,49 +263,21 @@ tensorflow::TensorShape ToTensorShape( sizes.begin(), sizes.end())); } -// Returns minimal value for the given int or float element type. -static ConstOp GetMinValueForType(Type ty, Location loc, - PatternRewriter *rewriter) { - RankedTensorType scalar_ty = RankedTensorType::get({}, ty); - - DenseElementsAttr attr; - if (auto float_ty = ty.dyn_cast_or_null()) { - APFloat neg_inf = - APFloat::getInf(float_ty.getFloatSemantics(), /*negative=*/true); - attr = DenseElementsAttr::get(scalar_ty, neg_inf); - } else { - auto int_ty = ty.cast(); - APInt min_val = APInt::getSignedMinValue(int_ty.getWidth()); - attr = DenseElementsAttr::get(scalar_ty, min_val); - } - return rewriter->create(loc, attr); -} - -// Returns maximal value for the given int or float element type. -static ConstOp GetMaxValueForType(Type ty, Location loc, - PatternRewriter *rewriter) { - RankedTensorType scalar_ty = RankedTensorType::get({}, ty); - - DenseElementsAttr attr; - if (auto float_ty = ty.dyn_cast_or_null()) { - APFloat pos_inf = - APFloat::getInf(float_ty.getFloatSemantics(), /*negative=*/false); - attr = DenseElementsAttr::get(scalar_ty, pos_inf); - } else { - auto int_ty = ty.cast(); - APInt max_val = APInt::getSignedMaxValue(int_ty.getWidth()); - attr = DenseElementsAttr::get(scalar_ty, max_val); - } - return rewriter->create(loc, attr); -} - -// Returns int or float scalar DenseElementsAttr attribute with the given -// element type and the value. +// Returns int, float, or complex scalar DenseElementsAttr attribute with the +// given element type and the value. static ConstOp GetScalarConstOfType(Type ty, Location loc, int64_t raw_value, OpBuilder *builder) { return builder->create(loc, hlo::GetScalarOfType(ty, raw_value)); } +// Returns a limit scalar const op for the given type. +// Requires FloatType or IntegerType +static ConstOp GetScalarLimitConstOfType(Type ty, Location loc, + hlo::ScalarLimit limit, + OpBuilder *builder) { + return builder->create(loc, hlo::GetScalarLimitOfType(ty, limit)); +} + // Creates an mhlo::SliceOp where the major dimensions have full size, and // the minor dimensions have the provided offsets and sizes. static Value SliceInMinorDims(Location loc, Value v, @@ -1065,6 +1038,21 @@ static void BuildSortComparisonBody(llvm::ArrayRef element_types, builder->create(loc, compare); } +//===----------------------------------------------------------------------===// +// XlaGather op utilities. +//===----------------------------------------------------------------------===// + +bool HasValidGatherDims(StringAttr attr) { + ::xla::GatherDimensionNumbers dims; + return dims.ParseFromString(attr.getValue().str()); +} + +GatherDimensionNumbers GetGatherDimNumsAttr(StringAttr attr, Builder *builder) { + ::xla::GatherDimensionNumbers dims; + if (!dims.ParseFromString(attr.getValue().str())) return {}; + return ::xla::ConvertGatherDimensionNumbers(dims, builder); +} + //===----------------------------------------------------------------------===// // Op converters. 
//===----------------------------------------------------------------------===// @@ -2385,15 +2373,16 @@ class ConvertMaxPoolOp : public OpRewritePattern { op.input().getType().template cast().getElementType(); if (!element_type.isSignlessIntOrFloat()) return failure(); Location loc = op.getLoc(); - ConstOp init = GetMinValueForType(element_type, loc, &rewriter); + ConstOp init = GetScalarLimitConstOfType(element_type, loc, + hlo::kInfinityLowest, &rewriter); auto input_ty = op.input().getType().template dyn_cast(); if (!input_ty) return failure(); DenseIntElementsAttr paddings_attr = GetReduceWindowPaddingAsAttr( input_ty.getShape(), op.ksize(), op.strides(), op.padding(), &rewriter); auto reduce = rewriter.create( - loc, op.getType(), op.input(), init.getResult(), - GetI64ElementsAttr(op.ksize()), GetI64ElementsAttr(op.strides()), + loc, op.getType(), op.input(), init, GetI64ElementsAttr(op.ksize()), + GetI64ElementsAttr(op.strides()), /*base_dilations=*/DenseIntElementsAttr(), /*window_dilations=*/DenseIntElementsAttr(), paddings_attr); BuildReduceBody(element_type, &reduce.body(), &rewriter); @@ -3636,7 +3625,8 @@ class ConvertMaxOp static Value GetInitialValue(Type reduce_element_type, Location loc, PatternRewriter *rewriter) { - return GetMinValueForType(reduce_element_type, loc, rewriter); + return GetScalarLimitConstOfType(reduce_element_type, loc, + hlo::kInfinityLowest, rewriter); } }; @@ -3653,7 +3643,8 @@ class ConvertMinOp static Value GetInitialValue(Type reduce_element_type, Location loc, PatternRewriter *rewriter) { - return GetMaxValueForType(reduce_element_type, loc, rewriter); + return GetScalarLimitConstOfType(reduce_element_type, loc, + hlo::kInfinityMax, rewriter); } }; @@ -3789,7 +3780,8 @@ class ConvertArgMaxOp static Value GetInitialValue(Type reduce_element_type, Location loc, PatternRewriter &rewriter) { - return GetMinValueForType(reduce_element_type, loc, &rewriter); + return GetScalarLimitConstOfType(reduce_element_type, loc, + hlo::kInfinityLowest, &rewriter); } static StringRef GetDirection() { return "GT"; } @@ -4728,7 +4720,7 @@ class GenericConvertUnsortedSegmentReductionOp : public OpRewritePattern { auto output_type = RankedTensorType::get(output_shape, data_type.getElementType()); - // Broadccast the initial value for reduction. This will become the + // Broadcast the initial value for reduction. This will become the // 'operand' parameter to scatter to for the final scatter op. Value init = ConcreteClass::GetInitialValue(data_type.getElementType(), op.getLoc(), &rewriter); @@ -4768,7 +4760,8 @@ class ConvertUnsortedSegmentMaxOp static Value GetInitialValue(Type reduce_element_type, Location loc, PatternRewriter *rewriter) { - return GetMinValueForType(reduce_element_type, loc, rewriter); + return GetScalarLimitConstOfType(reduce_element_type, loc, hlo::kLowest, + rewriter); } }; @@ -4781,7 +4774,8 @@ class ConvertUnsortedSegmentMinOp static Value GetInitialValue(Type reduce_element_type, Location loc, PatternRewriter *rewriter) { - return GetMaxValueForType(reduce_element_type, loc, rewriter); + return GetScalarLimitConstOfType(reduce_element_type, loc, hlo::kMax, + rewriter); } }; @@ -5092,17 +5086,19 @@ class ConvertXlaDynamicUpdateSliceOp } }; -/// Converts the Cumsum TensorFlow op to the HLO ReduceWindow op by setting -/// appropriate window dimensions, with 'add' as the reduction function. The -/// input tensor needs to have a static shape, and 'axis' must be const. 
The -/// TableGen pattern is not used for this rewrite because it involves regions. -class ConvertCumsumOp : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; +// Converts the Cumsum or Cumprod TensorFlow op to the HLO ReduceWindow op by +// setting appropriate window dimensions, with the given aggregation op as the +// reduction function. The input tensor needs to have a static shape, and 'axis' +// must be const. The TableGen pattern is not used for this rewrite because it +// involves regions. +template +class ConvertCumOp : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(TF::CumsumOp op, + LogicalResult matchAndRewrite(OpT op, PatternRewriter &rewriter) const override { auto input = op.x(); - auto input_type = input.getType().dyn_cast(); + auto input_type = input.getType().template dyn_cast(); if (!input_type || !input_type.hasStaticShape()) { return failure(); } @@ -5135,6 +5131,10 @@ class ConvertCumsumOp : public OpRewritePattern { // Convert if we need to enlarge the element type's bitwidth to avoid // precision loss. Type input_element_type = input_type.getElementType(); + + // TODO(hinsu): Handle complex element types. + if (!input_element_type.isIntOrFloat()) return failure(); + Type sum_element_type = GetSumAccumulationType(input_element_type); input = rewriter.create(op.getLoc(), input, sum_element_type); @@ -5148,8 +5148,9 @@ class ConvertCumsumOp : public OpRewritePattern { RankedTensorType::get({rank, 2}, rewriter.getIntegerType(64)), paddings); - Value init = - GetScalarConstOfType(sum_element_type, op.getLoc(), 0, &rewriter); + int64_t init_value = (std::is_same::value) ? 0 : 1; + Value init = GetScalarConstOfType(sum_element_type, op.getLoc(), init_value, + &rewriter); auto reduce = rewriter.create( op.getLoc(), input_type, input, init, @@ -5157,7 +5158,7 @@ class ConvertCumsumOp : public OpRewritePattern { GetI64ElementsAttr(rewriter.getI64ArrayAttr(window_strides)), /*base_dilations=*/DenseIntElementsAttr(), /*window_dilations=*/DenseIntElementsAttr(), paddings_attr); - BuildReduceBody(sum_element_type, &reduce.body(), &rewriter); + BuildReduceBody(sum_element_type, &reduce.body(), &rewriter); Value result = reduce.getResult(); if (op.exclusive()) { @@ -5193,6 +5194,9 @@ class ConvertCumsumOp : public OpRewritePattern { } }; +using ConvertCumsumOp = ConvertCumOp; +using ConvertCumprodOp = ConvertCumOp; + // Converts the Tensorflow ShapeOp to a sequence of Shape dialect and Standard // dialect lowerings. 
This involves extracting the shape type, extracting and // converting each dimension to a known integer type, and repacking into a final @@ -5857,7 +5861,7 @@ void PopulateLegalizeTfPatterns(MLIRContext *context, ConvertConv2DOp, ConvertConv3DOp, ConvertDepthConv2DOp, ConvertConv2DBackpropFilterOp, ConvertConv3DBackpropFilterOp, ConvertConv2DBackpropInputOp, ConvertConv3DBackpropInputOp, - ConvertCumsumOp, ConvertDiagPartOp, ConvertEinsumOp, + ConvertCumprodOp, ConvertCumsumOp, ConvertDiagPartOp, ConvertEinsumOp, ConvertFusedBatchNormGradOp, ConvertFusedBatchNormGradV2Op, ConvertFusedBatchNormGradV3Op, ConvertFusedBatchNormV2Op, ConvertFusedBatchNormV3Op, ConvertInfeedDequeueTupleOp, diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_communication.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_communication.cc index 1d6ce36300f..1f884b1bdea 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_communication.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_communication.cc @@ -215,11 +215,17 @@ void SetOpSharding(Operation* op, int64_t tpu_core) { } // Assigns frontend attributes holding information about data type and -// TensorFlow rendezvous channel name. -void SetFrontendAttributes(Operation* op, StringRef key, Type type) { +// TensorFlow rendezvous channel name. The TensorFlow rendezvous channel name is +// handled differently as individual names are used per data send and receive. +void SetFrontendAttributes(Operation* op, int32_t index, StringRef key, + Type type, bool device_to_host) { MLIRContext* context = op->getContext(); - auto rendezvous_name = StringAttr::get(key, context); + std::string formatted_key = + device_to_host ? llvm::formatv("{0}_dtoh_{1}", key, index).str() + : llvm::formatv("{0}_htod_{1}", key, index).str(); + + auto rendezvous_name = StringAttr::get(formatted_key, context); auto rendezvous_name_attr = NamedAttribute( Identifier::get(kXlaHostTransferRendezvousNameAttr, context), rendezvous_name); @@ -239,24 +245,10 @@ void SetFrontendAttributes(Operation* op, StringRef key, Type type) { op->setAttr(kFrontendAttributesAttr, frontend_attributes); } -// Assigns frontend attributes holding information about data type and -// TensorFlow rendezvous channel name specific to `tf._XlaHostComputeMlir`. -// TensorFlow rendezvous channel name is handled differently as individual names -// are used per data send and receive. -void SetFrontendAttributes(Operation* op, int32_t index, StringRef key, - Type type, bool device_to_host) { - std::string formatted_key = - device_to_host ? llvm::formatv("{0}_dtoh_{1}", key, index).str() - : llvm::formatv("{0}_htod_{1}", key, index).str(); - - return SetFrontendAttributes(op, formatted_key, type); -} - -// Creates a `mhlo.send` op for sending value `operand`. If `index` is set, -// `key` will be rewritten with a suffix and index. If `tpu_core` is set, op -// sharding for the respective device will be set. +// Creates a `mhlo.send` op for sending value `operand`. If `tpu_core` is set, +// op sharding for the respective device will be set. 
Value CreateSendOp(OpBuilder& builder, int64_t& channel_id, Location loc, - Value operand, StringRef key, const Optional& index, + Value operand, StringRef key, size_t index, const Optional& tpu_core, Value token) { // type 2 == DEVICE_TO_HOST auto channel_handle = ChannelHandle::get( @@ -266,23 +258,18 @@ Value CreateSendOp(OpBuilder& builder, int64_t& channel_id, Location loc, loc, token.getType(), operand, token, channel_handle, /*is_host_transfer=*/builder.getBoolAttr(true)); - if (index) { - SetFrontendAttributes(send, *index, key, operand.getType(), - /*device_to_host=*/true); - } else { - SetFrontendAttributes(send, key, operand.getType()); - } + SetFrontendAttributes(send, index, key, operand.getType(), + /*device_to_host=*/true); if (tpu_core) SetOpSharding(send, *tpu_core); return send.getResult(); } -// Creates a `mhlo.recv` op for receiving a value. If `index` is set, `key` will -// be rewritten with a suffix and index. If `tpu_core` is set, op sharding for -// the respective device will be set. +// Creates a `mhlo.recv` op for receiving a value. If `tpu_core` is set, op +// sharding for the respective device will be set. Value CreateRecvOp(OpBuilder& builder, int64_t& channel_id, Location loc, - Value result, StringRef key, const Optional& index, + Value result, StringRef key, size_t index, const Optional& tpu_core, Value token) { // type 3 == HOST_TO_DEVICE auto channel_handle = ChannelHandle::get( @@ -294,12 +281,10 @@ Value CreateRecvOp(OpBuilder& builder, int64_t& channel_id, Location loc, auto recv = builder.create(loc, recv_result_type, token, channel_handle, /*is_host_transfer=*/builder.getBoolAttr(true)); - if (index) { - SetFrontendAttributes(recv, *index, key, result_type, - /*device_to_host=*/false); - } else { - SetFrontendAttributes(recv, key, result.getType()); - } + + SetFrontendAttributes(recv, index, key, result_type, + /*device_to_host=*/false); + if (tpu_core) SetOpSharding(recv, *tpu_core); auto get_tuple_element = @@ -369,7 +354,7 @@ Value RewriteSendToHostOp(OpBuilder& builder, int64_t& channel_id, builder.setInsertionPoint(send_to_host); token = CreateSendOp(builder, channel_id, send_to_host.getLoc(), send_to_host.input(), send_to_host.key(), - /*index=*/llvm::None, /*tpu_core=*/llvm::None, token); + /*index=*/0, /*tpu_core=*/llvm::None, token); send_to_host.erase(); return token; @@ -381,7 +366,7 @@ Value RewriteRecvFromHostOp(OpBuilder& builder, int64_t& channel_id, builder.setInsertionPoint(recv_from_host); token = CreateRecvOp(builder, channel_id, recv_from_host.getLoc(), recv_from_host.output(), recv_from_host.key(), - /*index=*/llvm::None, /*tpu_core=*/llvm::None, token); + /*index=*/0, /*tpu_core=*/llvm::None, token); recv_from_host.erase(); return token; diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td index 1d4c9503afa..73ce305091c 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td @@ -51,6 +51,10 @@ def GetHLOAxisFromTFAxisVariadic : NativeCodeCall< "$0, (*$1.begin()).getType().cast().getRank(), " "&$_builder)">; +def CastElementsToI64Elements : NativeCodeCall< + "hlo::ConvertElementsAttr(" + "$0, $_builder.getIntegerType(64)).cast()">; + def : Pattern< (TF_FusedBatchNormOp:$root $x, $scale, $offset, $mean, $variance, $epsilon, $exponential_avg_factor, $data_format, @@ -255,12 +259,16 @@ def : Pat<(TF_ConcatV2Op $inputs, (TF_ConstOp OneElementAttr:$axis)), 
[(HasRankedFirstOperand $inputs)]>; //===----------------------------------------------------------------------===// -// CrossReplicaSum op patterns. +// CollectivePermute op patterns. //===----------------------------------------------------------------------===// -def CastElementsToI64Elements : NativeCodeCall< - "hlo::ConvertElementsAttr(" - "$0, $_builder.getIntegerType(64)).cast()">; +def : Pat<(TF_CollectivePermuteOp $input, (TF_ConstOp $source_target_pairs)), + (HLO_CollectivePermuteOp $input, + (CastElementsToI64Elements $source_target_pairs))>; + +//===----------------------------------------------------------------------===// +// CrossReplicaSum op patterns. +//===----------------------------------------------------------------------===// def : Pat<(TF_CrossReplicaSumOp $input, (TF_ConstOp $group_assignment)), (HLO_CrossReplicaSumOp $input, @@ -427,6 +435,35 @@ def : Pat<(TF_ConstOp:$res ElementsAttr:$value), (TensorCastOp (HLO_ConstOp $value)), [(HLO_Tensor $res)]>; +//===----------------------------------------------------------------------===// +// Elu op patterns. +//===----------------------------------------------------------------------===// + +def : Pat<(TF_EluOp AnyRankedTensor:$features), + (HLO_SelectOp + (HLOClient_BroadcastCompareOp + $features, + (HLO_ConstOp:$zero (GetScalarOfType<0> $features)), + (BinBroadcastDimensions $zero, $features), + HLO_COMPARISON_DIRECTION_GT), + $features, + (HLO_Expm1Op $features))>; + +def : Pat<(TF_EluGradOp AnyStaticShapeTensor:$gradients, AnyRankedTensor:$features), + (HLO_SelectOp + (HLOClient_BroadcastCompareOp + $features, + (HLO_ConstOp:$zero (GetScalarOfType<0> $features)), + (BinBroadcastDimensions $zero, $features), + HLO_COMPARISON_DIRECTION_GT), + $gradients, + (HLO_MulOp + $gradients, + (HLOClient_BroadcastAddOp + $features, + (HLO_ConstOp:$one (GetScalarOfType<1> $features)), + (BinBroadcastDimensions $one, $features))))>; + //===----------------------------------------------------------------------===// // Relu op patterns. //===----------------------------------------------------------------------===// @@ -660,3 +697,19 @@ def : Pattern<(TF_SoftplusOp AnyTensor:$features), ), (replaceWithValue $output) ]>; + +//===----------------------------------------------------------------------===// +// XlaGather op. +//===----------------------------------------------------------------------===// + +def ToGatherDimNumsAttr : NativeCodeCall<"GetGatherDimNumsAttr($0, &$_builder)">; + +def HasValidGatherDims : Constraint>; + +def : Pat<(TF_XlaGatherOp $operand, $start_indices, (TF_ConstOp $slice_sizes), + $dimension_numbers, $indices_are_sorted), + (HLO_GatherOp $operand, $start_indices, + (ToGatherDimNumsAttr $dimension_numbers), + (CastElementsToI64Elements $slice_sizes), + $indices_are_sorted), + [(HasValidGatherDims $dimension_numbers)]>; diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc index 904b80e05b1..2f73d1a54df 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc @@ -40,6 +40,7 @@ limitations under the License. 
#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h" #include "tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" @@ -81,6 +82,7 @@ bool IsOpAllowedTf2XlaFallback(Operation* op) { // TODO(hinsu): Drop explicit allowlist when MLIR based bridge is enabled for // all tf2xla kernels. // clang-format off + static llvm::SmallDenseSet ops = { TypeID::get(), TypeID::get(), @@ -102,6 +104,9 @@ bool IsOpAllowedTf2XlaFallback(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -110,12 +115,17 @@ bool IsOpAllowedTf2XlaFallback(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -124,6 +134,7 @@ bool IsOpAllowedTf2XlaFallback(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -139,6 +150,9 @@ bool IsOpAllowedTf2XlaFallback(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -149,26 +163,38 @@ bool IsOpAllowedTf2XlaFallback(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), + // TODO(hinsu): Canonicalize QuantizeAndDequantize and + // QuantizeAndDequantizeV2 to QuantizeAndDequantizeV3 by converting + // attributes to operands. + TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), - TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -177,6 +203,7 @@ bool IsOpAllowedTf2XlaFallback(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -190,9 +217,15 @@ bool IsOpAllowedTf2XlaFallback(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -200,6 +233,7 @@ bool IsOpAllowedTf2XlaFallback(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), diff --git a/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.cc b/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.cc index cc74d82839b..22462428367 100644 --- a/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.cc +++ b/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.cc @@ -34,7 +34,6 @@ limitations under the License. 
#include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassOptions.h" // from @llvm-project #include "mlir/Translation.h" // from @llvm-project -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" #include "tensorflow/compiler/mlir/xla/hlo_function_importer.h" #include "tensorflow/compiler/mlir/xla/hlo_utils.h" #include "tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h" @@ -182,7 +181,10 @@ template StatusOr LhloDialectEmitter::CreateOpWithoutAttrs( HloInstruction* instr) { Location loc = getLocation(instr); - ArrayRef> attrs; + std::pair attrs[] = { + {Identifier::get("name", builder_.getContext()), + builder_.getStringAttr(instr->name())}, + }; ArrayRef rets{}; llvm::SmallVector operands; @@ -252,15 +254,14 @@ Status LhloDialectEmitter::DefaultAction(HloInstruction* instr) { return Status::OK(); } -StatusOr LhloDialectEmitter::EmitSortOp( - HloInstruction* instr) { +StatusOr LhloDialectEmitter::EmitSortOp(HloInstruction* instr) { TF_ASSIGN_OR_RETURN(auto sort, CreateOpWithoutAttrs(instr)); auto* sort_instr = ::xla::Cast<::xla::HloSortInstruction>(instr); sort.dimensionAttr(builder_.getI64IntegerAttr(sort_instr->sort_dimension())); sort.is_stableAttr(builder_.getBoolAttr(sort_instr->is_stable())); TF_RETURN_IF_ERROR(::xla::HloFunctionImporter::ImportAsRegion( *sort_instr->called_computations()[0], &sort.comparator(), &builder_)); - return sort.getOperation(); + return sort; } Status LhloDialectEmitter::HandleSort(HloInstruction* instr) { diff --git a/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.h b/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.h index b191d53840d..89514116254 100644 --- a/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.h +++ b/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.h @@ -19,6 +19,7 @@ limitations under the License. 
#include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/hlo_module.h" @@ -41,7 +42,7 @@ class LhloDialectEmitter : public ::xla::DfsHloVisitorWithDefault { builder_(module.getContext()), i8_type_(builder_.getIntegerType(8)) {} - ::xla::StatusOr EmitSortOp(::xla::HloInstruction* instr); + ::xla::StatusOr EmitSortOp(::xla::HloInstruction* instr); private: template diff --git a/tensorflow/compiler/mlir/xla/type_to_shape.cc b/tensorflow/compiler/mlir/xla/type_to_shape.cc index afc36916348..b725f56b455 100644 --- a/tensorflow/compiler/mlir/xla/type_to_shape.cc +++ b/tensorflow/compiler/mlir/xla/type_to_shape.cc @@ -43,47 +43,41 @@ using xla::ShapeUtil; namespace xla { PrimitiveType TypeToPrimitiveType(mlir::Type type) { - switch (type.getKind()) { - case mlir::StandardTypes::BF16: - return PrimitiveType::BF16; - case mlir::StandardTypes::Complex: { - mlir::Type element_ty = type.cast().getElementType(); - switch (element_ty.getKind()) { - case mlir::StandardTypes::F32: - return PrimitiveType::C64; - case mlir::StandardTypes::F64: - return PrimitiveType::C128; - default: - return PrimitiveType::PRIMITIVE_TYPE_INVALID; - } + if (type.isBF16()) { + return PrimitiveType::BF16; + } else if (type.isF16()) { + return PrimitiveType::F16; + } else if (type.isF32()) { + return PrimitiveType::F32; + } else if (type.isF64()) { + return PrimitiveType::F64; + } else if (auto complex_type = type.dyn_cast()) { + mlir::Type element_ty = complex_type.getElementType(); + if (element_ty.isF32()) { + return PrimitiveType::C64; + + } else if (element_ty.isF64()) { + return PrimitiveType::C128; } - case mlir::StandardTypes::F16: - return PrimitiveType::F16; - case mlir::StandardTypes::F32: - return PrimitiveType::F32; - case mlir::StandardTypes::F64: - return PrimitiveType::F64; - case mlir::StandardTypes::Integer: { - const auto integer = type.cast(); - bool is_unsigned = integer.isUnsigned(); - switch (integer.getWidth()) { - case 1: - return PrimitiveType::PRED; - case 8: - return is_unsigned ? PrimitiveType::U8 : PrimitiveType::S8; - case 16: - return is_unsigned ? PrimitiveType::U16 : PrimitiveType::S16; - case 32: - return is_unsigned ? PrimitiveType::U32 : PrimitiveType::S32; - case 64: - return is_unsigned ? PrimitiveType::U64 : PrimitiveType::S64; - default: - return PrimitiveType::PRIMITIVE_TYPE_INVALID; - } + return PrimitiveType::PRIMITIVE_TYPE_INVALID; + } else if (auto integer_type = type.dyn_cast()) { + bool is_unsigned = integer_type.isUnsigned(); + switch (integer_type.getWidth()) { + case 1: + return PrimitiveType::PRED; + case 8: + return is_unsigned ? PrimitiveType::U8 : PrimitiveType::S8; + case 16: + return is_unsigned ? PrimitiveType::U16 : PrimitiveType::S16; + case 32: + return is_unsigned ? PrimitiveType::U32 : PrimitiveType::S32; + case 64: + return is_unsigned ? 
PrimitiveType::U64 : PrimitiveType::S64; + default: + return PrimitiveType::PRIMITIVE_TYPE_INVALID; } - default: - return PrimitiveType::PRIMITIVE_TYPE_INVALID; } + return PrimitiveType::PRIMITIVE_TYPE_INVALID; } StatusOr TypeToShape( @@ -108,108 +102,89 @@ Shape TypeToShape(mlir::Type type) { if (ptype != PrimitiveType::PRIMITIVE_TYPE_INVALID) return ShapeUtil::MakeShape(ptype, {}); - switch (type.getKind()) { - case mlir::StandardTypes::BF16: - case mlir::StandardTypes::F32: - case mlir::StandardTypes::F64: - case mlir::StandardTypes::Integer: { - auto* context = type.getContext(); - mlir::emitError(mlir::UnknownLoc::get(context)) - << "lowering should have been handled by primitive type lowering for " - << debugString(type); - break; + if (type.isBF16() || type.isF32() || type.isF64() || + type.isa()) { + auto* context = type.getContext(); + mlir::emitError(mlir::UnknownLoc::get(context)) + << "lowering should have been handled by primitive type lowering for " + << debugString(type); + } else if (auto v = type.dyn_cast()) { + llvm::SmallVector span(v.getShape().begin(), v.getShape().end()); + mlir::Type element_type = v.getElementType(); + PrimitiveType primitive_type = TypeToPrimitiveType(element_type); + if (primitive_type != PrimitiveType::PRIMITIVE_TYPE_INVALID) + return ShapeUtil::MakeShape(primitive_type, span); + } else if (auto m = type.dyn_cast()) { + llvm::SmallVector span(m.getShape().begin(), m.getShape().end()); + mlir::Type element_type = m.getElementType(); + // Treat a memref of a vector as if it was a memref of primitive type with + // the vector dimensions at the end. + if (auto v = element_type.dyn_cast()) { + element_type = v.getElementType(); + span.insert(span.end(), v.getShape().begin(), v.getShape().end()); } - case mlir::StandardTypes::Vector: { - const auto v = type.cast(); - llvm::SmallVector span(v.getShape().begin(), - v.getShape().end()); - mlir::Type element_type = v.getElementType(); - PrimitiveType primitive_type = TypeToPrimitiveType(element_type); - if (primitive_type != PrimitiveType::PRIMITIVE_TYPE_INVALID) - return ShapeUtil::MakeShape(primitive_type, span); - break; - } - case mlir::StandardTypes::MemRef: { - const auto m = type.cast(); - llvm::SmallVector span(m.getShape().begin(), - m.getShape().end()); - mlir::Type element_type = m.getElementType(); - // Treat a memref of a vector as if it was a memref of primitive type with - // the vector dimensions at the end. - if (auto v = element_type.dyn_cast()) { - element_type = v.getElementType(); - span.insert(span.end(), v.getShape().begin(), v.getShape().end()); + PrimitiveType primitive_type = TypeToPrimitiveType(element_type); + if (primitive_type == PrimitiveType::PRIMITIVE_TYPE_INVALID) return {}; + // For the primitive type case, the shape of the memref is similar to the + // vector type case (i.e., it is, modulo the layout, the same dimensions + // and primitive type). 
+ if (m.getAffineMaps().empty()) + return ShapeUtil::MakeShape(primitive_type, span); + + if (m.getAffineMaps().size() == 1) { + llvm::SmallVector strides; + int64_t offset; + if (failed(mlir::getStridesAndOffset(m, strides, offset))) return {}; + + llvm::SmallVector, 4> strides_with_indices; + for (const auto& e : llvm::enumerate(strides)) { + strides_with_indices.push_back({e.value(), e.index()}); } - PrimitiveType primitive_type = TypeToPrimitiveType(element_type); - if (primitive_type == PrimitiveType::PRIMITIVE_TYPE_INVALID) break; - // For the primitive type case, the shape of the memref is similar to the - // vector type case (i.e., it is, modulo the layout, the same dimensions - // and primitive type). - if (m.getAffineMaps().empty()) - return ShapeUtil::MakeShape(primitive_type, span); + std::sort(strides_with_indices.begin(), strides_with_indices.end()); - if (m.getAffineMaps().size() == 1) { - llvm::SmallVector strides; - int64_t offset; - if (failed(mlir::getStridesAndOffset(m, strides, offset))) return {}; + llvm::SmallVector minor_to_major; + int64_t stride = 1; + for (const auto& pr : strides_with_indices) { + minor_to_major.push_back(pr.second); - llvm::SmallVector, 4> strides_with_indices; - for (const auto& e : llvm::enumerate(strides)) { - strides_with_indices.push_back({e.value(), e.index()}); - } - std::sort(strides_with_indices.begin(), strides_with_indices.end()); + // Either the affine map is not perfectly strided, or the dimensions + // recovered from strides don't match the actual dimensions in shapes. + if (stride != pr.first) return {}; - llvm::SmallVector minor_to_major; - int64_t stride = 1; - for (const auto& pr : strides_with_indices) { - minor_to_major.push_back(pr.second); - - // Either the affine map is not perfectly strided, or the dimensions - // recovered from strides don't match the actual dimensions in shapes. - if (stride != pr.first) return {}; - - stride *= m.getShape()[pr.second]; - } - - llvm::SmallVector dimensions(m.getShape().begin(), - m.getShape().end()); - return ::xla::ShapeUtil::MakeShapeWithLayout(primitive_type, dimensions, - minor_to_major); + stride *= m.getShape()[pr.second]; } - break; + + llvm::SmallVector dimensions(m.getShape().begin(), + m.getShape().end()); + return ::xla::ShapeUtil::MakeShapeWithLayout(primitive_type, dimensions, + minor_to_major); } - case mlir::StandardTypes::RankedTensor: { - // TODO(jpienaar): This is only handling the base case with primitive - // element type. - const auto t = type.cast(); - llvm::SmallVector span(t.getShape().begin(), - t.getShape().end()); - // Only fully static shapes are supported. - // TODO(b/115638799): Update once xla::Shape can support dynamic shapes. - if (std::find(t.getShape().begin(), t.getShape().end(), -1) != - t.getShape().end()) - break; - mlir::Type element_type = t.getElementType(); - PrimitiveType primitive_type = TypeToPrimitiveType(element_type); - // Only primitive element type supported. - if (primitive_type != PrimitiveType::PRIMITIVE_TYPE_INVALID) - return ShapeUtil::MakeShape(primitive_type, span); - break; + } else if (auto t = type.dyn_cast()) { + // TODO(jpienaar): This is only handling the base case with primitive + // element type. + llvm::SmallVector span(t.getShape().begin(), t.getShape().end()); + // Only fully static shapes are supported. + // TODO(b/115638799): Update once xla::Shape can support dynamic shapes. 
+ if (std::find(t.getShape().begin(), t.getShape().end(), -1) != + t.getShape().end()) + return {}; + mlir::Type element_type = t.getElementType(); + PrimitiveType primitive_type = TypeToPrimitiveType(element_type); + // Only primitive element type supported. + if (primitive_type != PrimitiveType::PRIMITIVE_TYPE_INVALID) + return ShapeUtil::MakeShape(primitive_type, span); + } else if (auto tuple_type = type.dyn_cast()) { + llvm::SmallVector shapes; + shapes.reserve(tuple_type.size()); + for (mlir::Type sub_type : tuple_type.getTypes()) { + shapes.push_back(TypeToShape(sub_type)); } - case mlir::StandardTypes::Tuple: { - const auto t = type.cast(); - llvm::SmallVector shapes; - shapes.reserve(t.size()); - for (mlir::Type sub_type : t.getTypes()) { - shapes.push_back(TypeToShape(sub_type)); - } - return ShapeUtil::MakeTupleShape(shapes); - } - case mlir::mhlo::HLOTypes::Token: - return ShapeUtil::MakeTokenShape(); - default: - break; + return ShapeUtil::MakeTupleShape(shapes); + + } else if (type.isa()) { + return ShapeUtil::MakeTokenShape(); } + // Return empty XLA shape to signify error. No MLIR Type maps to a empty // Shape. return {}; diff --git a/tensorflow/compiler/mlir/xla/type_to_shape_test.cc b/tensorflow/compiler/mlir/xla/type_to_shape_test.cc index a4a2bc42d99..ce709b10462 100644 --- a/tensorflow/compiler/mlir/xla/type_to_shape_test.cc +++ b/tensorflow/compiler/mlir/xla/type_to_shape_test.cc @@ -64,6 +64,7 @@ inline ::testing::PolymorphicMatcher EqualsProto( TEST(TypeToShapeTest, ConvertPrimitiveTypes) { MLIRContext context; + context.loadAllGloballyRegisteredDialects(); Builder b(&context); EXPECT_EQ(TypeToPrimitiveType(b.getF32Type()), PrimitiveType::F32); @@ -74,6 +75,7 @@ TEST(TypeToShapeTest, ConvertPrimitiveTypes) { TEST(TypeToShapeTest, ConvertBasicTypesToTypes) { MLIRContext context; + context.loadAllGloballyRegisteredDialects(); Builder b(&context); EXPECT_TRUE( @@ -95,6 +97,7 @@ TEST(TypeToShapeTest, ConvertBasicTypesToTypes) { TEST(TypeToShapeTest, ConvertMemRefTypeToTypes) { MLIRContext context; + context.loadAllGloballyRegisteredDialects(); Builder b(&context); // Memref without any affine map. Note: memory space is ignored for shape. 
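The memref branch of TypeToShape above recovers an XLA minor-to-major layout from memref strides: it sorts (stride, dimension-index) pairs and accepts the layout only if each stride equals the product of the sizes of all more-minor dimensions, otherwise it returns an empty Shape. Below is a minimal standalone C++ sketch of just that check using plain standard-library containers; `MinorToMajorFromStrides`, its signature, and the example values are illustrative stand-ins for this note, not part of the MLIR or XLA APIs, and corner cases such as zero-sized dimensions are ignored.

// Standalone illustration (not TF/XLA code) of the stride-to-layout check
// performed for memrefs in TypeToShape.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <optional>
#include <utility>
#include <vector>

// Returns the minor-to-major dimension order implied by `strides` for a buffer
// with extents `dims`, or std::nullopt when the strides are not perfectly
// packed (the case where the code above gives up and returns an empty Shape).
std::optional<std::vector<int64_t>> MinorToMajorFromStrides(
    const std::vector<int64_t>& dims, const std::vector<int64_t>& strides) {
  std::vector<std::pair<int64_t, int64_t>> stride_and_dim;
  for (size_t i = 0; i < strides.size(); ++i) {
    stride_and_dim.push_back({strides[i], static_cast<int64_t>(i)});
  }
  std::sort(stride_and_dim.begin(), stride_and_dim.end());

  std::vector<int64_t> minor_to_major;
  int64_t expected_stride = 1;
  for (const auto& [stride, dim] : stride_and_dim) {
    // Each stride must equal the product of the sizes of all more-minor dims.
    if (stride != expected_stride) return std::nullopt;
    minor_to_major.push_back(dim);
    expected_stride *= dims[dim];
  }
  return minor_to_major;
}

int main() {
  // A 3x2 row-major buffer has strides {2, 1}; the recovered layout is {1, 0}.
  auto layout = MinorToMajorFromStrides({3, 2}, {2, 1});
  if (layout) {
    for (int64_t d : *layout) std::cout << d << ' ';
    std::cout << '\n';  // prints: 1 0
  }
  return 0;
}

For the same {3, 2} shape, strides such as {3, 1} are rejected because they are not perfectly packed, which corresponds to the empty-Shape return path in the hunk above.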
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index 7f099540f39..30b8a7e5561 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -265,6 +265,7 @@ tf_xla_py_test( name = "categorical_op_test", size = "small", srcs = ["categorical_op_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -283,6 +284,7 @@ tf_xla_py_test( name = "cholesky_op_test", size = "medium", srcs = ["cholesky_op_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -347,6 +349,7 @@ tf_xla_py_test( size = "small", timeout = "moderate", srcs = ["searchsorted_op_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -389,6 +392,7 @@ tf_xla_py_test( size = "small", timeout = "moderate", srcs = ["matrix_inverse_op_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -411,6 +415,7 @@ tf_xla_py_test( size = "small", timeout = "moderate", srcs = ["matrix_solve_op_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -429,6 +434,7 @@ tf_xla_py_test( size = "small", timeout = "moderate", srcs = ["matrix_triangular_solve_op_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -469,7 +475,6 @@ tf_xla_py_test( enable_mlir_bridge = True, python_version = "PY3", tags = [ - "many_xla_args", "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip "no_rocm", ], @@ -533,6 +538,7 @@ tf_xla_py_test( name = "depthwise_conv_op_test", size = "medium", srcs = ["depthwise_conv_op_test.py"], + enable_mlir_bridge = True, python_version = "PY3", shard_count = 5, tags = [ @@ -632,6 +638,7 @@ tf_xla_py_test( name = "extract_image_patches_op_test", size = "small", srcs = ["extract_image_patches_op_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -688,6 +695,7 @@ tf_xla_py_test( name = "fft_test", size = "medium", srcs = ["fft_test.py"], + enable_mlir_bridge = True, python_version = "PY3", shard_count = 6, tags = [ @@ -783,6 +791,7 @@ tf_xla_py_test( name = "listdiff_op_test", size = "small", srcs = ["listdiff_op_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -821,6 +830,7 @@ tf_xla_py_test( name = "manip_ops_test", size = "small", srcs = ["manip_ops_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -928,6 +938,7 @@ tf_xla_py_test( name = "pooling_ops_test", size = "medium", srcs = ["pooling_ops_test.py"], + enable_mlir_bridge = True, python_version = "PY3", shard_count = 20, tags = [ @@ -1006,6 +1017,7 @@ tf_xla_py_test( "cpu", "cpu_ondemand", ], + enable_mlir_bridge = True, python_version = "PY3", shard_count = 5, tags = [ @@ -1032,6 +1044,7 @@ tf_xla_py_test( "cpu", "cpu_ondemand", ], + enable_mlir_bridge = True, python_version = "PY3", 
shard_count = 5, tags = [ @@ -1114,6 +1127,7 @@ tf_xla_py_test( name = "reverse_ops_test", size = "medium", srcs = ["reverse_ops_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -1165,6 +1179,7 @@ tf_xla_py_test( name = "scan_ops_test", size = "medium", srcs = ["scan_ops_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -1183,6 +1198,7 @@ tf_xla_py_test( name = "segment_reduction_ops_test", size = "medium", srcs = ["segment_reduction_ops_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -1281,6 +1297,7 @@ tf_xla_py_test( name = "stateless_random_ops_test", size = "medium", srcs = ["stateless_random_ops_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -1564,6 +1581,7 @@ tf_xla_py_test( name = "xla_device_test", size = "small", srcs = ["xla_device_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -1886,6 +1904,7 @@ tf_xla_py_test( name = "special_math_test", size = "medium", srcs = ["special_math_test.py"], + enable_mlir_bridge = True, shard_count = 5, tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip diff --git a/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py b/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py index 0202c582ef3..9d278cfbb28 100644 --- a/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py +++ b/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py @@ -135,6 +135,7 @@ class MatrixTriangularSolveOpTest(xla_test.XLATestCase): self._VerifyTriangularSolve( a.astype(np.float32), b.astype(np.float32), True, False, 1e-4) + @test_util.disable_mlir_bridge("Error handling") def testNonSquareCoefficientMatrix(self): rng = np.random.RandomState(0) for dtype in self.float_types: @@ -145,6 +146,7 @@ class MatrixTriangularSolveOpTest(xla_test.XLATestCase): linalg_ops.matrix_triangular_solve(a, b) @test_util.run_v2_only # Different error types + @test_util.disable_mlir_bridge("Error handling") def testWrongDimensionsV2(self): randn = np.random.RandomState(0).randn for dtype in self.float_types: @@ -156,6 +158,7 @@ class MatrixTriangularSolveOpTest(xla_test.XLATestCase): linalg_ops.matrix_triangular_solve(lhs, rhs) @test_util.run_v1_only("Different error types") + @test_util.disable_mlir_bridge("Error handling") def testWrongDimensionsV1(self): randn = np.random.RandomState(0).randn for dtype in self.float_types: diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc index 9f963110cf3..0f19affc8e3 100644 --- a/tensorflow/compiler/tests/randomized_tests.cc +++ b/tensorflow/compiler/tests/randomized_tests.cc @@ -63,9 +63,9 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor_testutil.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/graph/graph.h" -#include "tensorflow/core/lib/bfloat16/bfloat16.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/bfloat16.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/public/session.h" #include "tensorflow/core/public/session_options.h" diff --git a/tensorflow/compiler/tests/scan_ops_test.py b/tensorflow/compiler/tests/scan_ops_test.py index 7c36f8b13ca..440b7672d98 100644 --- a/tensorflow/compiler/tests/scan_ops_test.py +++ b/tensorflow/compiler/tests/scan_ops_test.py @@ -24,6 +24,7 @@ from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import constant_op from tensorflow.python.framework import errors_impl from tensorflow.python.framework import ops +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.platform import test @@ -129,6 +130,7 @@ class CumsumTest(xla_test.XLATestCase): for axis in range(-6, 6, 3): self._compareAll(x, axis) + @test_util.disable_mlir_bridge("Error handling") def testInvalidAxis(self): x = np.arange(0, 10).reshape([2, 5]).astype(np.float32) with self.session(), self.test_scope(): @@ -207,6 +209,7 @@ class CumprodTest(xla_test.XLATestCase): for axis in range(-6, 6, 3): self._compareAll(x, axis) + @test_util.disable_mlir_bridge("Error handling") def testInvalidAxis(self): x = np.arange(0, 10).reshape([2, 5]).astype(np.float32) with self.session(), self.test_scope(): diff --git a/tensorflow/compiler/tests/ternary_ops_test.py b/tensorflow/compiler/tests/ternary_ops_test.py index 7bbfecff403..4109fdc64a5 100644 --- a/tensorflow/compiler/tests/ternary_ops_test.py +++ b/tensorflow/compiler/tests/ternary_ops_test.py @@ -214,7 +214,6 @@ class TernaryOpsTest(xla_test.XLATestCase, parameterized.TestCase): upper, expected=np.minimum(np.maximum(x, lower), upper)) - @test_util.disable_mlir_bridge('Enable tf.Betainc Compilation') def testBetaincSanity(self): # This operation is only supported for float32 and float64. for dtype in self.numeric_types & {np.float32, np.float64}: @@ -252,7 +251,6 @@ class TernaryOpsTest(xla_test.XLATestCase, parameterized.TestCase): 'atol': 2e-4 }, ) - @test_util.disable_mlir_bridge('Enable tf.Betainc Compilation') def testBetainc(self, sigma, rtol, atol): # This operation is only supported for float32 and float64. 
for dtype in self.numeric_types & {np.float32, np.float64}: diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py index eb022da6895..b5f82bcff12 100644 --- a/tensorflow/compiler/tests/unary_ops_test.py +++ b/tensorflow/compiler/tests/unary_ops_test.py @@ -96,7 +96,7 @@ class UnaryOpsTest(xla_test.XLATestCase): self.assertAllEqual(result, expected) @test_util.disable_mlir_bridge( - "MlirHloBuilder::Iota missing required for xla::Diag") + "Handle complex element type in DiagPart lowering") def testAllTypeOps(self): for dtype in self.numeric_types - {np.int8, np.uint8}: self._assertOpOutputMatchesExpected( @@ -538,8 +538,6 @@ class UnaryOpsTest(xla_test.XLATestCase): np.array([-40, 40], dtype=dtype), expected=np.array([1.0, 0.025], dtype=dtype)) - @test_util.disable_mlir_bridge( - "TODO(b/153812660): Handle tf.QuantizeAndDequantize compilation") def testQuantizeAndDequantize(self): for dtype in self.float_types: @@ -1070,8 +1068,6 @@ class UnaryOpsTest(xla_test.XLATestCase): ], equality_test=self.ListsAreClose) - @test_util.disable_mlir_bridge( - "TODO(b/153812660): Handle tf.DepthToSpace compilation") def testDepthToSpace(self): def make_op(data_format): @@ -1118,14 +1114,12 @@ class UnaryOpsTest(xla_test.XLATestCase): self._assertOpOutputMatchesExpected( make_op("NCHW_VECT_C"), np.arange(32, dtype=dtype).reshape((1, 8, 1, 1, 4)), - expected=np.array([[[[[0, 1], [8, 9]], [[16, 17], [24, 25]]], - [[[2, 3], [10, 11]], [[18, 19], [26, 27]]], - [[[4, 5], [12, 13]], [[20, 21], [28, 29]]], - [[[6, 7], [14, 15]], [[22, 23], [30, 31]]]]], + expected=np.array([[[[[0, 1, 2, 3], [8, 9, 10, 11]], + [[16, 17, 18, 19], [24, 25, 26, 27]]], + [[[4, 5, 6, 7], [12, 13, 14, 15]], + [[20, 21, 22, 23], [28, 29, 30, 31]]]]], dtype=dtype)) - @test_util.disable_mlir_bridge( - "TODO(b/153812660): Handle tf.SpaceToDepth compilation") def testSpaceToDepth(self): def make_op(data_format): @@ -1172,11 +1166,11 @@ class UnaryOpsTest(xla_test.XLATestCase): self._assertOpOutputMatchesExpected( make_op("NCHW_VECT_C"), np.arange(32, dtype=dtype).reshape((1, 2, 2, 2, 4)), - expected=np.array([[[[[0, 1, 2, 3, 16, 17, 18, 19]]], - [[[4, 5, 6, 7, 20, 21, 22, 23]]], - [[[8, 9, 10, 11, 24, 25, 26, 27]]], - [[[12, 13, 14, 15, 28, 29, 30, 31]]]]], - dtype=dtype)) + expected=np.array( + [[[[[0, 1, 2, 3]]], [[[16, 17, 18, 19]]], [[[4, 5, 6, 7]]], + [[[20, 21, 22, 23]]], [[[8, 9, 10, 11]]], [[[24, 25, 26, 27]]], + [[[12, 13, 14, 15]]], [[[28, 29, 30, 31]]]]], + dtype=dtype)) def _assertSoftplusMatchesExpected(self, features, diff --git a/tensorflow/compiler/tf2tensorrt/BUILD b/tensorflow/compiler/tf2tensorrt/BUILD index 0718bd8cd65..44fb5513886 100644 --- a/tensorflow/compiler/tf2tensorrt/BUILD +++ b/tensorflow/compiler/tf2tensorrt/BUILD @@ -11,7 +11,6 @@ load( "tf_custom_op_library_additional_deps", "tf_gen_op_libs", "tf_gen_op_wrapper_py", - "tf_gpu_kernel_library", ) # buildifier: disable=same-origin-load @@ -81,6 +80,7 @@ tf_cuda_cc_test( cc_library( name = "common_utils", + srcs = ["common/utils.cc"], hdrs = ["common/utils.h"], copts = tf_copts(), deps = [ @@ -539,20 +539,6 @@ tf_cuda_cc_test( ], ) -tf_gpu_kernel_library( - name = "plugin_cast", - srcs = ["plugin/plugin_cast.cu.cc"], - deps = [ - ":trt_plugins", - "@com_google_absl//absl/strings", - "//tensorflow/core/platform:logging", - "//tensorflow/core:framework_lite", - ] + if_tensorrt([ - "@local_config_cuda//cuda:cuda_headers", - "@local_config_tensorrt//:tensorrt", - ]), -) - tf_cuda_library( name = "trt_plugins", srcs = 
["plugin/trt_plugin.cc"], @@ -602,6 +588,7 @@ pybind_extension( link_in_framework = True, module_name = "_pywrap_py_utils", deps = [ + ":common_utils", ":py_utils", "//tensorflow/core/platform:env", "//tensorflow/core/platform:logging", diff --git a/tensorflow/compiler/tf2tensorrt/common/utils.cc b/tensorflow/compiler/tf2tensorrt/common/utils.cc new file mode 100644 index 00000000000..6679ca04513 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/common/utils.cc @@ -0,0 +1,99 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "absl/base/call_once.h" +#include "absl/strings/str_join.h" +#include "third_party/tensorrt/NvInferPlugin.h" +#endif + +namespace tensorflow { +namespace tensorrt { + +std::tuple GetLinkedTensorRTVersion() { +#if GOOGLE_CUDA && GOOGLE_TENSORRT + return std::tuple{NV_TENSORRT_MAJOR, NV_TENSORRT_MINOR, + NV_TENSORRT_PATCH}; +#else + return std::tuple{0, 0, 0}; +#endif +} + +std::tuple GetLoadedTensorRTVersion() { +#if GOOGLE_CUDA && GOOGLE_TENSORRT + int ver = getInferLibVersion(); + int major = ver / 1000; + ver = ver - major * 1000; + int minor = ver / 100; + int patch = ver - minor * 100; + return std::tuple{major, minor, patch}; +#else + return std::tuple{0, 0, 0}; +#endif +} + +} // namespace tensorrt +} // namespace tensorflow + +#if GOOGLE_CUDA && GOOGLE_TENSORRT +namespace tensorflow { +namespace tensorrt { +namespace { + +void InitializeTrtPlugins(nvinfer1::ILogger* trt_logger) { + LOG(INFO) << "Linked TensorRT version: " + << absl::StrJoin(GetLinkedTensorRTVersion(), "."); + LOG(INFO) << "Loaded TensorRT version: " + << absl::StrJoin(GetLoadedTensorRTVersion(), "."); + + bool plugin_initialized = initLibNvInferPlugins(trt_logger, ""); + if (!plugin_initialized) { + LOG(ERROR) << "Failed to initialize TensorRT plugins, and conversion may " + "fail later."; + } + + int num_trt_plugins = 0; + nvinfer1::IPluginCreator* const* trt_plugin_creator_list = + getPluginRegistry()->getPluginCreatorList(&num_trt_plugins); + if (!trt_plugin_creator_list) { + LOG_WARNING_WITH_PREFIX << "Can not find any TensorRT plugins in registry."; + } else { + VLOG(1) << "Found the following " << num_trt_plugins + << " TensorRT plugins in registry:"; + for (int i = 0; i < num_trt_plugins; ++i) { + if (!trt_plugin_creator_list[i]) { + LOG_WARNING_WITH_PREFIX + << "TensorRT plugin at index " << i + << " is not accessible (null pointer returned by " + "getPluginCreatorList for this plugin)"; + } else { + VLOG(1) << " " << trt_plugin_creator_list[i]->getPluginName(); + } + } + } +} + +} // namespace + +void MaybeInitializeTrtPlugins(nvinfer1::ILogger* trt_logger) { + static absl::once_flag once; + absl::call_once(once, InitializeTrtPlugins, trt_logger); +} + +} // namespace tensorrt +} // namespace tensorflow +#endif diff --git 
a/tensorflow/compiler/tf2tensorrt/common/utils.h b/tensorflow/compiler/tf2tensorrt/common/utils.h index b428733ecd4..b76b75de783 100644 --- a/tensorflow/compiler/tf2tensorrt/common/utils.h +++ b/tensorflow/compiler/tf2tensorrt/common/utils.h @@ -16,15 +16,33 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_UTILS_H_ #define TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_UTILS_H_ +#include + +namespace tensorflow { +namespace tensorrt { +// Returns the compile time TensorRT library version information +// {Maj, Min, Patch}. +std::tuple GetLinkedTensorRTVersion(); + +// Returns the runtime time TensorRT library version information +// {Maj, Min, Patch}. +std::tuple GetLoadedTensorRTVersion(); +} // namespace tensorrt +} // namespace tensorflow + #if GOOGLE_CUDA && GOOGLE_TENSORRT #include "tensorflow/core/platform/logging.h" +#include "third_party/tensorrt/NvInfer.h" namespace tensorflow { namespace tensorrt { #define LOG_WARNING_WITH_PREFIX LOG(WARNING) << "TF-TRT Warning: " +// Initializes the TensorRT plugin registry if this hasn't been done yet. +void MaybeInitializeTrtPlugins(nvinfer1::ILogger* trt_logger); + } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index f80c0f42eca..c0c3f25177e 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -1197,42 +1197,6 @@ Status TrtNodeValidator::ConvertConstToWeights( return status; } -static void InitializeTrtPlugins(nvinfer1::ILogger* trt_logger) { - static mutex plugin_mutex(LINKER_INITIALIZED); - static bool plugin_initialized = false; - mutex_lock lock(plugin_mutex); - if (plugin_initialized) return; - - LOG(INFO) << "Linked TensorRT version: " << GetLinkedTensorRTVersion(); - LOG(INFO) << "Loaded TensorRT version: " << GetLoadedTensorRTVersion(); - - plugin_initialized = initLibNvInferPlugins(trt_logger, ""); - if (!plugin_initialized) { - LOG(ERROR) << "Failed to initialize TensorRT plugins, and conversion may " - "fail later."; - } - - int num_trt_plugins = 0; - nvinfer1::IPluginCreator* const* trt_plugin_creator_list = - getPluginRegistry()->getPluginCreatorList(&num_trt_plugins); - if (!trt_plugin_creator_list) { - LOG_WARNING_WITH_PREFIX << "Can not find any TensorRT plugins in registry."; - } else { - VLOG(1) << "Found the following " << num_trt_plugins - << " TensorRT plugins in registry:"; - for (int i = 0; i < num_trt_plugins; ++i) { - if (!trt_plugin_creator_list[i]) { - LOG_WARNING_WITH_PREFIX - << "TensorRT plugin at index " << i - << " is not accessible (null pointer returned by " - "getPluginCreatorList for this plugin)"; - } else { - VLOG(1) << " " << trt_plugin_creator_list[i]->getPluginName(); - } - } - } -} - // static StatusOr> Converter::Create( TrtPrecisionMode precision_mode, bool use_calibration, @@ -1249,7 +1213,7 @@ Converter::Converter(TrtPrecisionMode precision_mode, bool use_calibration, : precision_mode_(precision_mode), use_calibration_(use_calibration), use_implicit_batch_(use_implicit_batch) { - InitializeTrtPlugins(trt_logger); + MaybeInitializeTrtPlugins(trt_logger); this->RegisterOpConverters(); } @@ -1434,7 +1398,8 @@ Status Converter::BuildCudaEngine( TF_RETURN_IF_ERROR( TrtPrecisionModeToName(precision_mode_, &precision_mode_str)); string trt_network_name = StrCat( - "TF:", TF_VERSION_STRING, ", ", "TRT:", GetLoadedTensorRTVersion(), "-", + "TF:", TF_VERSION_STRING, ", ", + 
"TRT:", absl::StrJoin(GetLoadedTensorRTVersion(), "."), "-", "Precision:", precision_mode_str, ", ", "Calibration:", use_calibration_, ", ", "Max-Batch-Size:", max_batch_size, ", ", "Max-Workspace-Size:", max_workspace_size_bytes); diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index aeae44a5562..72348c3cede 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -5374,7 +5374,9 @@ TEST_P(OpConverterTest1, ConvertReduce) { expected_output_dims.erase(std::remove(expected_output_dims.begin(), expected_output_dims.end(), 0), expected_output_dims.end()); - VLOG(2) << "out dims " << expected_output_dims; + VLOG(2) << "out dims " + << absl::StrCat("[", absl::StrJoin(expected_output_dims, ","), + "]"); std::vector expected_values = CalcReduce( op.name, p.helper_array, p.stride, op.val_func, op.init_val); TestOpConverter("my_reduce", node_def, expected_output_dims, diff --git a/tensorflow/compiler/tf2tensorrt/convert/utils.cc b/tensorflow/compiler/tf2tensorrt/convert/utils.cc index a69960005fc..1fc0d13c993 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/utils.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/utils.cc @@ -241,36 +241,6 @@ int GetNumberOfEngineInputs(const nvinfer1::ICudaEngine* engine) { #endif -string GetLinkedTensorRTVersion() { - int major, minor, patch; -#if GOOGLE_CUDA && GOOGLE_TENSORRT - major = NV_TENSORRT_MAJOR; - minor = NV_TENSORRT_MINOR; - patch = NV_TENSORRT_PATCH; -#else - major = 0; - minor = 0; - patch = 0; -#endif - return absl::StrCat(major, ".", minor, ".", patch); -} - -string GetLoadedTensorRTVersion() { - int major, minor, patch; -#if GOOGLE_CUDA && GOOGLE_TENSORRT - int ver = getInferLibVersion(); - major = ver / 1000; - ver = ver - major * 1000; - minor = ver / 100; - patch = ver - minor * 100; -#else - major = 0; - minor = 0; - patch = 0; -#endif - return absl::StrCat(major, ".", minor, ".", patch); -} - absl::string_view GetDeviceName(const Node* node) { if (node->has_assigned_device_name()) { return node->assigned_device_name(); diff --git a/tensorflow/compiler/tf2tensorrt/convert/utils.h b/tensorflow/compiler/tf2tensorrt/convert/utils.h index a0505c3f922..7570dff1c9d 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/utils.h +++ b/tensorflow/compiler/tf2tensorrt/convert/utils.h @@ -117,14 +117,6 @@ Status TrtDimsToTensorShape(const nvinfer1::Dims trt_dims, Status TfTypeToTrtType(DataType tf_type, nvinfer1::DataType* trt_type); Status TrtTypeToTfType(nvinfer1::DataType trt_type, DataType* tf_type); -// Returns a string that includes compile time TensorRT library version -// information {Maj, Min, Patch}. -string GetLinkedTensorRTVersion(); - -// Returns a string that includes runtime time TensorRT library version -// information {Maj, Min, Patch}. -string GetLoadedTensorRTVersion(); - // Returns true if an engine built for cached_shapes can also run actual_shapes. 
bool AreShapesCompatible(const std::vector& actual_shapes, const std::vector& cached_shapes); diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc index 58d1c611463..5b2ae822d59 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc @@ -800,6 +800,9 @@ StatusOr> TRTEngineOp::GetEngine( TrtUniquePtrType infer(nvinfer1::createInferRuntime(logger)); infer->setGpuAllocator(allocator); + // Need to initialize plugins in order to deserialize engines that contain + // plugins. + MaybeInitializeTrtPlugins(&logger); TrtUniquePtrType static_engine( infer->deserializeCudaEngine(serialized_segment_.c_str(), serialized_segment_.size(), nullptr)); diff --git a/tensorflow/compiler/tf2tensorrt/plugin/plugin_cast.cu.cc b/tensorflow/compiler/tf2tensorrt/plugin/plugin_cast.cu.cc deleted file mode 100644 index 141a7d1f462..00000000000 --- a/tensorflow/compiler/tf2tensorrt/plugin/plugin_cast.cu.cc +++ /dev/null @@ -1,236 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "absl/strings/str_cat.h" -#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h" -#include "tensorflow/core/platform/logging.h" - -#if GOOGLE_CUDA && GOOGLE_TENSORRT -#define EIGEN_USE_GPU // For definition of Eigen::GpuDevice. 
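The plugin setup above is now guarded by absl::call_once instead of the removed mutex-plus-bool pattern, and trt_engine_op.cc invokes it before deserializing an engine so that engines containing plugins can be restored. A minimal sketch of the same one-time-initialization idiom; InitOnce and EnsureInitialized are illustrative names, not TF-TRT symbols.

#include <iostream>

#include "absl/base/call_once.h"

// Expensive setup that must run exactly once, regardless of how many threads
// or call sites race to trigger it (cf. InitializeTrtPlugins above).
void InitOnce(const char* reason) {
  std::cout << "initializing (" << reason << ")\n";
}

void EnsureInitialized(const char* reason) {
  static absl::once_flag once;
  // Arguments after the callable are forwarded to it; later calls return
  // immediately without invoking InitOnce again.
  absl::call_once(once, InitOnce, reason);
}

int main() {
  EnsureInitialized("engine deserialization");
  EnsureInitialized("graph conversion");  // no-op: already initialized
  return 0;
}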
-#include "third_party/gpus/cuda/include/cuda_runtime_api.h" -#include "tensorflow/core/util/gpu_kernel_helper.h" -#include "third_party/tensorrt/NvInfer.h" - -namespace tensorflow { -namespace tensorrt { -using nvinfer1::DataType; -using nvinfer1::Dims; -using nvinfer1::IPluginCreator; -using nvinfer1::IPluginV2; -using nvinfer1::IPluginV2Ext; -using nvinfer1::PluginField; -using nvinfer1::PluginFieldCollection; -using nvinfer1::PluginFieldType; -using nvinfer1::PluginFormat; - -template -__global__ void Cast(const SrcT* input, int num_elements, DstT* output) { - for (int i : CudaGridRangeX(num_elements)) { - output[i] = static_cast(input[i]); - } -} - -template -void RunCast(const SrcT* d_input, int num_elements, DstT* d_output, - cudaStream_t stream) { - const int threads_per_block = 256; - const int blocks_per_grid = - (num_elements + threads_per_block - 1) / threads_per_block; - TF_CHECK_OK(CudaLaunchKernel(Cast, threads_per_block, - blocks_per_grid, 0, stream, d_input, - num_elements, d_output)); -} - -const char* kPluginName = "TfTrtPluginCast"; - -class CastPlugin : public TrtPlugin { - public: - CastPlugin(DataType src_type, DataType dst_type) - : src_type_(src_type), dst_type_(dst_type) {} - - CastPlugin(const void* serialized_data, size_t length) - : TrtPlugin(serialized_data, length) { - const char* buffer = static_cast(serialized_data); - src_type_ = ReadFromBuffer(&buffer); - dst_type_ = ReadFromBuffer(&buffer); - src_dims_ = ReadFromBuffer(&buffer); - } - - CastPlugin(const CastPlugin& rhs) - : TrtPlugin(rhs), - src_type_(rhs.src_type_), - dst_type_(rhs.dst_type_), - src_dims_(rhs.src_dims_) {} - - // Methods from IPluginV2Ext. - - DataType getOutputDataType(int index, const DataType* input_types, - int num_inputs) const override { - DCHECK_EQ(0, index); - DCHECK_EQ(1, num_inputs); - return dst_type_; - } - - bool isOutputBroadcastAcrossBatch(int output_index, - const bool* input_is_broadcasted, - int num_inputs) const override { - return false; - } - - bool canBroadcastInputAcrossBatch(int input_index) const override { - return false; - } - - void configurePlugin(const Dims* input_dims, int num_inputs, - const Dims* output_dims, int num_outputs, - const DataType* input_types, - const DataType* output_types, - const bool* input_is_broadcast, - const bool* output_is_broadcast, - PluginFormat float_format, int max_batch_size) override { - DCHECK_EQ(1, num_inputs); - DCHECK_EQ(1, num_outputs); - DCHECK(src_type_ == input_types[0]); - DCHECK(dst_type_ == output_types[0]); - src_dims_ = input_dims[0]; - } - - IPluginV2Ext* clone() const override { return new CastPlugin(*this); } - - // Methods from IPluginV2. 
- - const char* getPluginType() const override { return kPluginName; }; - - const char* getPluginVersion() const override { return kTfTrtPluginVersion; }; - - int getNbOutputs() const override { return 1; } - - Dims getOutputDimensions(int index, const Dims* inputs, - int num_input_dims) override { - DCHECK_EQ(0, index); - DCHECK_EQ(1, num_input_dims); - return inputs[0]; - } - - bool supportsFormat(DataType type, PluginFormat format) const override { - return type == DataType::kFLOAT || type == DataType::kINT32; - } - - size_t getWorkspaceSize(int max_batch_size) const override { return 0; } - - int enqueue(int batch_size, const void* const* inputs, void** outputs, void*, - cudaStream_t stream) override { - int num_elements = batch_size; - for (int i = 0; i < src_dims_.nbDims; i++) { - num_elements *= src_dims_.d[i]; - } - const void* input = inputs[0]; - void* output = outputs[0]; - DCHECK_NE(static_cast(src_type_), static_cast(dst_type_)); - - switch (src_type_) { - case DataType::kFLOAT: - RunCast(reinterpret_cast(input), num_elements, - reinterpret_cast(output), stream); - break; - case DataType::kINT32: - RunCast(reinterpret_cast(input), num_elements, - reinterpret_cast(output), stream); - break; - default: - return 1; // Indicates a failure. - } - return 0; - } - - size_t getSerializationSize() const override { - return 2 * sizeof(DataType) + sizeof(Dims); - } - - void serialize(void* serialized_data) const override { - char* buffer = static_cast(serialized_data); - WriteToBuffer(src_type_, &buffer); - WriteToBuffer(dst_type_, &buffer); - WriteToBuffer(src_dims_, &buffer); - } - - private: - DataType src_type_; - DataType dst_type_; - Dims src_dims_; -}; - -class CastPluginCreator : public IPluginCreator { - public: - CastPluginCreator() { - setPluginNamespace(kTfTrtPluginNamespace); - plugin_fields_.emplace_back( - PluginField("SrcT", nullptr, PluginFieldType::kINT32, 1)); - plugin_fields_.emplace_back( - PluginField("DstT", nullptr, PluginFieldType::kINT32, 1)); - - field_collection_.nbFields = plugin_fields_.size(); - field_collection_.fields = plugin_fields_.data(); - } - - const char* getPluginName() const override { return kPluginName; } - - const char* getPluginVersion() const override { return kTfTrtPluginVersion; } - - const PluginFieldCollection* getFieldNames() override { - return &field_collection_; - } - - IPluginV2* createPlugin( - const char* name, - const PluginFieldCollection* field_collection) override { - const PluginField* fields = field_collection->fields; - DataType src_type, dst_type; - for (int i = 0; i < field_collection->nbFields; ++i) { - const char* attr_name = fields[i].name; - if (!strcmp(attr_name, "SrcT")) { - src_type = *static_cast(fields[i].data); - } else if (!strcmp(attr_name, "DstT")) { - dst_type = *static_cast(fields[i].data); - } else { - return nullptr; - } - } - return new CastPlugin(src_type, dst_type); - } - - IPluginV2* deserializePlugin(const char* name, const void* serial_data, - size_t serial_len) override { - return new CastPlugin(serial_data, serial_len); - } - - void setPluginNamespace(const char* plugin_namespace) override { - namespace_ = plugin_namespace; - } - - const char* getPluginNamespace() const override { return namespace_.c_str(); } - - private: - PluginFieldCollection field_collection_; - std::vector plugin_fields_; - std::string namespace_; -}; - -REGISTER_TFTRT_PLUGIN(CastPluginCreator); - -} // namespace tensorrt -} // namespace tensorflow - -#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git 
a/tensorflow/compiler/tf2tensorrt/utils/py_utils.cc b/tensorflow/compiler/tf2tensorrt/utils/py_utils.cc index a8e24aa8983..3f8a11f7410 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/py_utils.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/py_utils.cc @@ -41,31 +41,5 @@ bool IsGoogleTensorRTEnabled() { #endif } -void GetLinkedTensorRTVersion(int* major, int* minor, int* patch) { -#if GOOGLE_CUDA && GOOGLE_TENSORRT - *major = NV_TENSORRT_MAJOR; - *minor = NV_TENSORRT_MINOR; - *patch = NV_TENSORRT_PATCH; -#else - *major = 0; - *minor = 0; - *patch = 0; -#endif -} - -void GetLoadedTensorRTVersion(int* major, int* minor, int* patch) { -#if GOOGLE_CUDA && GOOGLE_TENSORRT - int ver = getInferLibVersion(); - *major = ver / 1000; - ver = ver - *major * 1000; - *minor = ver / 100; - *patch = ver - *minor * 100; -#else - *major = 0; - *minor = 0; - *patch = 0; -#endif -} - } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/compiler/tf2tensorrt/utils/py_utils.h b/tensorflow/compiler/tf2tensorrt/utils/py_utils.h index f52bb6f1bad..9b24eb36cf9 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/py_utils.h +++ b/tensorflow/compiler/tf2tensorrt/utils/py_utils.h @@ -21,12 +21,6 @@ namespace tensorrt { bool IsGoogleTensorRTEnabled(); -// Return compile time TensorRT library version information {Maj, Min, Patch}. -void GetLinkedTensorRTVersion(int* major, int* minor, int* patch); - -// Return runtime time TensorRT library version information {Maj, Min, Patch}. -void GetLoadedTensorRTVersion(int* major, int* minor, int* patch); - } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/compiler/tf2tensorrt/utils/py_utils_wrapper.cc b/tensorflow/compiler/tf2tensorrt/utils/py_utils_wrapper.cc index 03f77c6bd5f..52252f125ac 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/py_utils_wrapper.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/py_utils_wrapper.cc @@ -16,18 +16,15 @@ limitations under the License. 
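With the version getters now returning a std::tuple of three ints, the pybind11 wrapper below can hand the result straight back to Python, since pybind11 converts std::tuple to a Python tuple out of the box. A stripped-down sketch of that pattern; the module and function names here are illustrative only.

#include <tuple>

#include "pybind11/pybind11.h"

// Stand-in for GetLinkedTensorRTVersion(); returns {major, minor, patch}.
std::tuple<int, int, int> linked_version_demo() { return {7, 1, 3}; }

PYBIND11_MODULE(_version_demo, m) {
  // Exposed to Python as a 3-element tuple, e.g. (7, 1, 3).
  m.def("get_linked_tensorrt_version", &linked_version_demo);
}

From Python this reads naturally as major, minor, patch = _version_demo.get_linked_tensorrt_version().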
#include #include "pybind11/pybind11.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/py_utils.h" std::tuple get_linked_tensorrt_version() { - int major, minor, patch; - tensorflow::tensorrt::GetLinkedTensorRTVersion(&major, &minor, &patch); - return std::tuple{major, minor, patch}; + return tensorflow::tensorrt::GetLinkedTensorRTVersion(); } std::tuple get_loaded_tensorrt_version() { - int major, minor, patch; - tensorflow::tensorrt::GetLoadedTensorRTVersion(&major, &minor, &patch); - return std::tuple{major, minor, patch}; + return tensorflow::tensorrt::GetLoadedTensorRTVersion(); } PYBIND11_MODULE(_pywrap_py_utils, m) { diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index ac999d875de..e9bcbcc6d83 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -337,7 +337,6 @@ cc_library( visibility = [":friends"], deps = [ ":common", - ":frontend_attributes_util", ":host_compute_metadata_proto_cc", ":rearrange_function_argument", ":sharding_util", @@ -353,23 +352,16 @@ cc_library( "//tensorflow/compiler/jit:common", "//tensorflow/compiler/jit:flags", "//tensorflow/compiler/jit:shape_inference", - "//tensorflow/compiler/jit:xla_cluster_util", "//tensorflow/compiler/mlir/tensorflow:compile_mlir_util_no_tf_dialect_passes", - "//tensorflow/compiler/tf2xla/lib:util", - "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:protobuf_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", - "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto_cc", - "//tensorflow/compiler/xla/client", "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/client:xla_computation", - "//tensorflow/compiler/xla/client/lib:arithmetic", - "//tensorflow/compiler/xla/client/lib:constants", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", @@ -378,11 +370,8 @@ cc_library( "//tensorflow/core:lib_internal", "//tensorflow/core:ops", "//tensorflow/core:protos_all_cc", - "//tensorflow/core:stream_executor_no_cuda", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/memory", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:span", "@com_google_absl//absl/types:variant", ], diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc index 10b26f9801c..596fa8e8e38 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc @@ -46,12 +46,254 @@ limitations under the License. namespace tensorflow { +// Helper functions for functionalizing control flow in functions. + +// Maps function name to +// - new function name, if the function body was functionalized +// - absl::nullopt, if not +using FuncMap = std::map>; +using FuncMapIter = std::map>::const_iterator; + +// Returns whether function has been processed before. +bool FunctionHasBeenProcessed(FuncMapIter func_iter, const FuncMap* func_map) { + return func_iter != func_map->end(); +} + +// Returns whether function has been modified (i.e., functionalized) before. 
+bool FunctionHasBeenModified(FuncMapIter func_iter) { + return func_iter->second.has_value(); +} + +// Returns a name for the new functionalized version of a function. +string GetNewFunctionName( + const string& func_name, Node* n, + AssociatedFunctionInfo::AssociatedFunctionType func_type, + FunctionLibraryDefinition* fld) { + // For SymbolicGradient, `func_name` is always "SymbolicGradient" which + // is not very informative. Use node name instead. + return ( + func_type == + AssociatedFunctionInfo::AssociatedFunctionType::kSymbolicGradient + ? fld->UniqueFunctionName(absl::StrCat(n->name(), "_f15n_")) + : fld->UniqueFunctionName(absl::StrCat(func_name, "_f15n_"))); +} + +// Returns name to which a modified function has been mapped. +const string& GetMappedFunctionName(FuncMapIter func_iter) { + DCHECK(func_iter->second.has_value()); + return func_iter->second.value(); +} + +// Updates `func_map` with function given by `canonicalized_name`. +void UpdateFunctionMap(FuncMap* func_map, const string& canonicalized_name, + const string& new_func_name, bool function_modified) { + // If function was modified store its new name, otherwise add empty entry to + // record that function has been processed and does not need to be rewritten. + (*func_map)[canonicalized_name] = + function_modified ? absl::make_optional(new_func_name) : absl::nullopt; +} + +// Adds new function def to graph's function library if necessary. +Status AddFunctionDefToGraphLibrary( + const string& func_name, const AssociatedFunctionInfo& associated_function, + Graph* graph, FunctionLibraryDefinition* fld) { + const OpRegistrationData* op_reg_data; + // We have to be careful with adding the function def since there are three + // different `OpRegistryInterface`s involved here: + // `fld`, `graph->flib_def()` and `graph->flib_def().default_registry()`. + // We have already added the function def to `fld` before calling this + // function but for the subsequent `RewriteAssociatedFunction` call we need + // the function def to be in one of the other two registries, otherwise + // `RewriteAssociatedFunction` will fail for the `kFunctionCallNode` case + // because it cannot find the associated function def. + // On the other hand, we should not add the function def if it is already + // contained in one of the last two registries, this would lead to errors when + // the function def is already in one registry and we try to add it to the + // other one (if we try to add it to the same it's fine). This can happen in + // cases where one of the last two registries is identical to `fld` (which we + // already updated). + // Therefore, before adding the function def we have to check if it's already + // contained in either `graph->flib_def()` or + // `graph->flib_def().default_registry()` which is done in the following line + // (we have to use `LookUp` instead of `Contains` or `Find` because the latter + // both don't check the default registry). + if (graph->flib_def().LookUp(func_name, &op_reg_data).ok()) + return Status::OK(); + + const FunctionDef* new_fdef = fld->Find(func_name); + DCHECK(new_fdef != nullptr); + FunctionDefLibrary fdef_lib; + *(fdef_lib.add_function()) = *new_fdef; + return graph->AddFunctionLibrary(fdef_lib); +} + +// Functionalizes function given by `func_name`. Update `func_map` accordingly. 
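Taken together, FuncMap and the helpers above memoize per-function functionalization results: the presence of an entry means "already processed", and a non-empty value carries the rewritten name. The template arguments of FuncMap were lost in the hunk text above; the sketch below assumes the likely std::map<string, absl::optional<string>> shape and uses illustrative names (ProcessOnce, rewrite_fn) rather than the real pass.

#include <iostream>
#include <map>
#include <string>

#include "absl/types/optional.h"

// Assumed shape of FuncMap: canonicalized function name -> new name if the
// function was rewritten, absl::nullopt if it was processed but unchanged.
using FuncMap = std::map<std::string, absl::optional<std::string>>;

// Processes `name` at most once; `rewrite_fn` returns the new name or nullopt.
template <typename RewriteFn>
absl::optional<std::string> ProcessOnce(const std::string& name,
                                        RewriteFn rewrite_fn,
                                        FuncMap* func_map) {
  auto it = func_map->find(name);
  if (it != func_map->end()) return it->second;  // already processed
  absl::optional<std::string> result = rewrite_fn(name);
  (*func_map)[name] = result;
  return result;
}

int main() {
  FuncMap func_map;
  auto rewrite = [](const std::string& name) {
    return absl::make_optional(name + "_f15n_");
  };
  ProcessOnce("cond_fn", rewrite, &func_map);
  // The second call is answered from the map without rewriting again.
  std::cout << ProcessOnce("cond_fn", rewrite, &func_map).value() << "\n";
  return 0;
}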
+Status FunctionalizeControlFlowForFunction( + const string& func_name, const string& new_func_name, + const protobuf::Map& attrs, + FunctionLibraryDefinition* fld, FunctionLibraryRuntime* flr, + FuncMap* func_map, bool* function_modified, + const NodeFilter& node_filter = {}); + +// Functionalizes all functions that are (directly or indirectly) associated to +// any node in `graph`. Adds processed functions to `func_map`. +Status FunctionalizeControlFlowForNodeAssociatedFunctions( + FuncMap* func_map, Graph* graph, FunctionLibraryDefinition* fld, + FunctionLibraryRuntime* flr, bool* any_function_modified, + const NodeFilter& node_filter) { + std::vector>> + nodes_to_associated_functions; + for (auto* n : graph->nodes()) { + auto associated_functions = GetAssociatedFunctions(*n, fld); + if (!associated_functions.empty()) { + nodes_to_associated_functions.push_back({n, associated_functions}); + } + } + for (const auto& pair : nodes_to_associated_functions) { + Node* n = pair.first; + auto associated_functions = pair.second; + for (auto& associated_function : associated_functions) { + // Note that if `n` is a function call node, then potential calls of + // `RewriteAssociatedFunction` below might delete `n` and create a new + // node instead, making `n` an invalid pointer. That's fine because in + // that case `n` only has one associated function, so this loop has only + // one iteration and we don't use `n` again after the rewrite. + // The invariant is guaranteed by `GetAssociatedFunctions` and confirmed + // below. + DCHECK(associated_function.type() != + AssociatedFunctionInfo::kFunctionCallNode || + associated_functions.size() == 1); + + // Process one node-function-pair. + string func_name = associated_function.func_name(); + string canonicalized_name = + Canonicalize(func_name, AttrSlice(&associated_function.attrs())); + auto func_iter = func_map->find(canonicalized_name); + string new_func_name; + if (FunctionHasBeenProcessed(func_iter, func_map)) { + if (FunctionHasBeenModified(func_iter)) { + *any_function_modified = true; + new_func_name = GetMappedFunctionName(func_iter); + TF_RETURN_IF_ERROR(RewriteAssociatedFunction( + graph, n, fld, associated_function, new_func_name)); + } + continue; + } + // Function is processed for the first time. + bool function_modified = false; + new_func_name = + GetNewFunctionName(func_name, n, associated_function.type(), fld); + // Perform functionalization for current function. + TF_RETURN_IF_ERROR(FunctionalizeControlFlowForFunction( + func_name, new_func_name, associated_function.attrs(), fld, flr, + func_map, &function_modified, node_filter)); + UpdateFunctionMap(func_map, canonicalized_name, new_func_name, + function_modified); + if (function_modified) { + *any_function_modified = true; + TF_RETURN_IF_ERROR(AddFunctionDefToGraphLibrary( + new_func_name, associated_function, graph, fld)); + TF_RETURN_IF_ERROR(RewriteAssociatedFunction( + graph, n, fld, associated_function, new_func_name)); + } + } + } + return Status::OK(); +} + +Status FunctionalizeControlFlowForFunction( + const string& func_name, const string& new_func_name, + const protobuf::Map& attrs, + FunctionLibraryDefinition* fld, FunctionLibraryRuntime* flr, + FuncMap* func_map, bool* function_modified, const NodeFilter& node_filter) { + *function_modified = false; + + // Convert the function to a graph. 
+ FunctionLibraryRuntime::Handle handle; + TF_RETURN_IF_ERROR(flr->Instantiate(func_name, AttrSlice(&attrs), &handle)); + Status ret_status = Status::OK(); + auto cleanup_handle = gtl::MakeCleanup([&]() { + auto s = flr->ReleaseHandle(handle); + if (!s.ok()) { + ret_status.Update(s); + } + }); + const FunctionBody* body = flr->GetFunctionBody(handle); + Graph* g = body->graph; + + // Check if the graph has Switch or Merge node. + bool has_switch_or_merge = false; + for (Node* n : body->graph->nodes()) { + // Skip nodes that are filtered out. + if (node_filter && !node_filter(n)) continue; + if (n->type_string() == "Switch" || n->type_string() == "Merge") { + has_switch_or_merge = true; + break; + } + } + // Before functionalizing control flow in `g` we functionalize control flow + // in functions (directly or indirectly) associated with nodes in `g`. + TF_RETURN_IF_ERROR(FunctionalizeControlFlowForNodeAssociatedFunctions( + func_map, g, fld, flr, function_modified, node_filter)); + + if (has_switch_or_merge) { + *function_modified = true; + + // Functionalize the function body. + if (VLOG_IS_ON(4)) { + DumpGraphToFile( + absl::StrCat("functionalize_control_flow_before_fdef_", func_name), + *g, fld); + } + TF_RETURN_IF_ERROR(FunctionalizeControlFlow(g, fld, node_filter)); + if (VLOG_IS_ON(4)) { + DumpGraphToFile( + absl::StrCat("functionalize_control_flow_after_fdef_", func_name), *g, + fld); + } + } + if (*function_modified) { + // Add rewritten FunctionDef into library. + FunctionDef functionalized_fdef; + TF_RETURN_IF_ERROR( + GraphToFunctionDef(*g, new_func_name, &functionalized_fdef)); + if (func_name == new_func_name) { + VLOG(2) << "Replacing function " << func_name; + TF_RETURN_IF_ERROR( + fld->ReplaceFunction(new_func_name, functionalized_fdef)); + } else { + VLOG(2) << "Adding function " << new_func_name; + TF_RETURN_IF_ERROR(fld->AddFunctionDef(functionalized_fdef)); + } + } + + return ret_status; +} + Status FunctionalizeControlFlow(Graph* graph, FunctionLibraryDefinition* library, - const NodeFilter& node_filter) { + const NodeFilter& node_filter, + bool include_functions) { VLOG(2) << "FunctionalizeControlFlow (initial): " << DumpGraphToFile("functionalize_initial", *graph, library); + if (include_functions) { + // Functionalize control flow in functions that are (directly or indirectly) + // associated with a node in `graph`. + auto pflr = absl::make_unique( + /*device_mgr=*/nullptr, tensorflow::Env::Default(), + /*config=*/nullptr, TF_GRAPH_DEF_VERSION, library, + tensorflow::OptimizerOptions()); + // `pflr` has only one `FunctionLibraryRuntime`, for `kDefaultFLRDevice` + // (because we constructed it with `device_mgr = nullptr`). + FunctionLibraryRuntime* flr = + pflr->GetFLR(ProcessFunctionLibraryRuntime::kDefaultFLRDevice); + + FuncMap func_map; + bool modified = false; + TF_RETURN_IF_ERROR(FunctionalizeControlFlowForNodeAssociatedFunctions( + &func_map, graph, library, flr, &modified, node_filter)); + } // Functionalize and remove while loops from graph. 
TF_RETURN_IF_ERROR(FunctionalizeWhileLoop(graph, library, node_filter)); @@ -68,153 +310,19 @@ Status FunctionalizeControlFlow(Graph* graph, Status FunctionalizeControlFlowForGraphDef(GraphDef* graph_def, FunctionLibraryDefinition* library, - const NodeFilter& node_filter) { + const NodeFilter& node_filter, + bool include_functions) { FunctionDefLibrary function_lib = graph_def->library(); Graph graph(OpRegistry::Global()); TF_RETURN_IF_ERROR(ConvertGraphDefToGraph({}, *graph_def, &graph)); - TF_RETURN_IF_ERROR(FunctionalizeControlFlow(&graph, library, node_filter)); + TF_RETURN_IF_ERROR(FunctionalizeControlFlow(&graph, library, node_filter, + include_functions)); graph.ToGraphDef(graph_def); std::swap(*graph_def->mutable_library(), function_lib); return Status::OK(); } -Status FunctionalizeControlFlowForFunction( - const string& func_name, const string& new_func_name, - const protobuf::Map& attrs, - FunctionLibraryDefinition* fld, FunctionLibraryRuntime* flr, - std::map>* canonicalized_name_to_new_name, - bool* modified) { - *modified = false; - - // Convert the function to Graph. - FunctionLibraryRuntime::Handle handle; - TF_RETURN_IF_ERROR(flr->Instantiate(func_name, AttrSlice(&attrs), &handle)); - Status ret_status = Status::OK(); - auto cleanup_handle = gtl::MakeCleanup([&]() { - auto s = flr->ReleaseHandle(handle); - if (!s.ok()) { - ret_status.Update(s); - } - }); - const FunctionBody* body = flr->GetFunctionBody(handle); - Graph* g = body->graph; - - // Check if the graph has Switch or Merge node. - bool has_switch_or_merge = false; - for (Node* n : body->graph->nodes()) { - if (n->type_string() == "Switch" || n->type_string() == "Merge") { - has_switch_or_merge = true; - break; - } - } - // We cannot return here directly if the graph has no Switch/Merge. - // It might contain function call nodes, or If/While nodes with Switch/Merge - // in function body. We still need to rewrite those functions and modify - // corresponding nodes. - - // If any node has associated functions, functionalize them first. - // Gather nodes with associated functions first, because rewriting those nodes - // might involve node deletion/addition. Avoid modifying nodes while iterating - // it. - std::vector>> - nodes_to_associated_functions; - for (auto* n : g->nodes()) { - auto associated_functions = GetAssociatedFunctions(*n, fld); - if (!associated_functions.empty()) { - nodes_to_associated_functions.push_back({n, associated_functions}); - } - } - for (const auto& iter : nodes_to_associated_functions) { - Node* n = iter.first; - auto associated_functions = iter.second; - for (auto& associated_function : associated_functions) { - string name = associated_function.func_name(); - string canonicalized_name = - Canonicalize(name, AttrSlice(&associated_function.attrs())); - auto iter = canonicalized_name_to_new_name->find(canonicalized_name); - string new_name; - bool function_modified; - if (iter != canonicalized_name_to_new_name->end()) { - // If we already processed this function, check if it was rewritten. If - // the function was rewritten, the entry will be non-empty. Otherwise - // the entry will be empty. - function_modified = iter->second.has_value(); - if (function_modified) { - new_name = iter->second.value(); - } - } else { - if (associated_function.type() == - AssociatedFunctionInfo::AssociatedFunctionType::kSymbolicGradient) { - // For SymbolicGradient, `name` is always "SymbolicGradient", - // which is not very informative. Use node name instead. 
- new_name = fld->UniqueFunctionName(absl::StrCat(n->name(), "_f15n_")); - } else { - new_name = fld->UniqueFunctionName(absl::StrCat(name, "_f15n_")); - } - TF_RETURN_IF_ERROR(FunctionalizeControlFlowForFunction( - name, new_name, associated_function.attrs(), fld, flr, - canonicalized_name_to_new_name, &function_modified)); - if (function_modified) { - // If the function was rewritten, add an non-empty entry. So later we - // know we have processed this function, and it was rewritten into - // another function. - (*canonicalized_name_to_new_name)[canonicalized_name] = new_name; - } else { - // If the function was not rewritten, add an empty entry. So later - // we know we have processed this function, and it does not need to be - // rewritten. - (*canonicalized_name_to_new_name)[canonicalized_name] = absl::nullopt; - } - } - if (function_modified) { - *modified = true; - - // Notice that if "n" is a function call, RewriteAssociatedFunction() - // will delete it and create a new node instead, making "n" an invalid - // pointer. That's fine because in that case, associated_functions will - // only have one member and the loop will only run once. - TF_RETURN_IF_ERROR(RewriteAssociatedFunction( - g, n, fld, associated_function, new_name)); - } - } - } - - if (has_switch_or_merge) { - *modified = true; - - // Functionalize the function body. - if (VLOG_IS_ON(4)) { - DumpGraphToFile( - absl::StrCat("functionalize_control_flow_before_fdef_", func_name), - *g, fld); - } - TF_RETURN_IF_ERROR(FunctionalizeControlFlow(g, fld)); - if (VLOG_IS_ON(4)) { - DumpGraphToFile( - absl::StrCat("functionalize_control_flow_after_fdef_", func_name), *g, - fld); - } - } - - if (*modified) { - // Add rewritten FunctionDef into library. - FunctionDef functionalized_fdef; - TF_RETURN_IF_ERROR( - GraphToFunctionDef(*g, new_func_name, &functionalized_fdef)); - if (func_name == new_func_name) { - VLOG(2) << "Replacing function " << func_name; - TF_RETURN_IF_ERROR( - fld->ReplaceFunction(new_func_name, functionalized_fdef)); - } else { - VLOG(2) << "Adding function " << new_func_name; - TF_RETURN_IF_ERROR(fld->AddFunctionDef(functionalized_fdef)); - } - } - - return ret_status; -} - Status FunctionalizeControlFlowForXlaPass::Run( const GraphOptimizationPassOptions& options) { Graph* graph = options.graph->get(); @@ -241,7 +349,7 @@ Status FunctionalizeControlFlowForXlaPass::Run( // XlaLaunch ops are generated by EncapsulateXlaComputationsPass. {"XlaLaunch", "function"}, }; - std::map> canonicalized_name_to_new_name; + FuncMap func_map; bool fld_modified = false; for (Node* n : graph->nodes()) { auto it = kNodeTypeToFunctionAttrMapping->find(n->type_string()); @@ -258,7 +366,7 @@ Status FunctionalizeControlFlowForXlaPass::Run( bool modified; TF_RETURN_IF_ERROR(FunctionalizeControlFlowForFunction( func.name(), new_func_name, func.attr(), options.flib_def, flr, - &canonicalized_name_to_new_name, &modified)); + &func_map, &modified)); if (modified) { n->ClearAttr(func_attr); func.set_name(new_func_name); diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.h b/tensorflow/compiler/tf2xla/functionalize_control_flow.h index f9e751e2d67..46abae27878 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow.h +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.h @@ -30,6 +30,13 @@ namespace tensorflow { // // If `node_filter` is defined, then only loops and conditions for whose // nodes `node_filter` returns true are functionalized. 
+ +// If `include_functions` is true, then loops and conditions inside of functions +// that are associated with nodes in `graph` (e.g., a function called from a +// node in `graph`) are also functionalized, otherwise they are not. +// This also handles transitive cases, e.g., a function body will be +// functionalized when it is called in another function that is called by some +// node in `graph` (and so on). The node filter also applies here. // // Precondition: // For any node in a loop or condition for which `node_filter` returns true, @@ -43,11 +50,13 @@ namespace tensorflow { // satisfies the above conditions. Status FunctionalizeControlFlow(Graph* graph, FunctionLibraryDefinition* library, - const NodeFilter& node_filter = {}); + const NodeFilter& node_filter = {}, + bool include_functions = false); Status FunctionalizeControlFlowForGraphDef(GraphDef* graph_def, FunctionLibraryDefinition* library, - const NodeFilter& node_filter = {}); + const NodeFilter& node_filter = {}, + bool include_functions = false); // This pass looks at the graph, and turns V1 control flow structure // (Switch/Merge/etc.) into V2 control flow structure (If/While). diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc index 79a042ad680..951ebdd7ec1 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc @@ -27,12 +27,15 @@ limitations under the License. #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/graph/graph_def_builder.h" #include "tensorflow/core/graph/validate.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/public/version.h" +#include "tensorflow/core/util/dump_graph.h" #include "tensorflow/core/util/equal_graph_def.h" namespace tensorflow { @@ -63,18 +66,41 @@ Status FindIfThenAndElse(const GraphDef& graph, string* op_name, // math_ops.less(y, x), lambda: math_ops.multiply(y, 17), // lambda: math_ops.add(x, 23)) // -// Tests different node filters. -class ConditionalTestFixture : public ::testing::TestWithParam { +// Tests different node filters and functionalization inside of a function. 
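Callers opt into the new behaviour through the extra include_functions argument added to both entry points above. A short, non-compiling call-site sketch; graph, graph_def, library, and node_filter stand for whatever the caller already has in scope.

// Functionalize control flow in `graph` and, transitively, in every function
// reachable from its nodes; passing false (the default) leaves function
// bodies untouched, matching the old behaviour.
TF_RETURN_IF_ERROR(FunctionalizeControlFlow(graph, &library, node_filter,
                                            /*include_functions=*/true));

TF_RETURN_IF_ERROR(FunctionalizeControlFlowForGraphDef(
    &graph_def, &library, node_filter, /*include_functions=*/true));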
+class ConditionalTestFixture + : public ::testing::TestWithParam> { protected: - void SetUp() override { restrict_to_tpu_nodes_ = GetParam(); } + void SetUp() override { + restrict_to_tpu_nodes_ = std::get<0>(GetParam()); + wrap_condition_in_function_ = std::get<1>(GetParam()); + } void RunTest(); private: + void BuildCondGraph(Graph* cond_graph); + void CheckGraphDef(const GraphDef& graph_def, + const FunctionLibraryDefinition& library); + bool restrict_to_tpu_nodes_ = false; + bool wrap_condition_in_function_ = false; }; -void ConditionalTestFixture::RunTest() { - Graph graph(OpRegistry::Global()); +TEST_P(ConditionalTestFixture, ConditionalTests) { RunTest(); } + +INSTANTIATE_TEST_SUITE_P( + FunctionalizeControlFlow, ConditionalTestFixture, + ::testing::Combine(::testing::Bool(), ::testing::Bool()), + [](const ::testing::TestParamInfo& + info) { + bool restrict_to_tpu_nodes = std::get<0>(info.param); + bool wrap_cond_in_function = std::get<1>(info.param); + string name = + absl::StrCat(restrict_to_tpu_nodes ? "with_filter" : "without_filter", + wrap_cond_in_function ? "_in_function" : "_in_graph"); + return name; + }); + +void ConditionalTestFixture::BuildCondGraph(Graph* cond_graph) { { Scope scope = Scope::NewRootScope().ExitOnError(); @@ -102,13 +128,117 @@ void ConditionalTestFixture::RunTest() { auto merge = ops::Merge(scope.WithOpName("cond/Merge"), std::initializer_list{add, mul}); - TF_EXPECT_OK(scope.ToGraph(&graph)); + TF_EXPECT_OK(scope.ToGraph(cond_graph)); // Set `_tpu_replicate` attribute for all nodes. - for (Node* n : graph.nodes()) { + for (Node* n : cond_graph->nodes()) { n->AddAttr("_tpu_replicate", "cluster"); } } +} + +void ConditionalTestFixture::CheckGraphDef( + const GraphDef& graph_def, const FunctionLibraryDefinition& library) { + string op_name; + NameAttrList then_fn; + NameAttrList else_fn; + TF_EXPECT_OK(FindIfThenAndElse(graph_def, &op_name, &then_fn, &else_fn)); + InstantiationResultForTest else_result; + TF_EXPECT_OK( + InstantiateFunctionForTest(else_fn.name(), library, &else_result)); + + // Outer graph + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto y = ops::Placeholder(scope.WithOpName("y"), DT_INT32); + auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32); + auto less = ops::Less(scope.WithOpName("cond/Less"), y, x); + auto if_op = + ops::If(scope.WithOpName(op_name), less, + std::initializer_list{less, y, x}, {DT_INT32}, then_fn, + else_fn, ops::If::OutputShapes({PartialTensorShape()})); + auto id = ops::Identity(scope.WithOpName("cond/Merge"), if_op.output[0]); + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); + TF_EXPECT_GRAPH_EQ(expected, graph_def); + } + + // then body. 
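The fixture above now takes a std::tuple<bool, bool> parameter and instantiates all four combinations via ::testing::Combine with a custom name generator. The same pattern in isolation; DemoTest and the parameter names are illustrative only.

#include <string>
#include <tuple>

#include "gtest/gtest.h"

class DemoTest : public ::testing::TestWithParam<std::tuple<bool, bool>> {};

TEST_P(DemoTest, Runs) {
  bool restrict_to_tpu_nodes;
  bool wrap_in_function;
  std::tie(restrict_to_tpu_nodes, wrap_in_function) = GetParam();
  // A real test would branch on the two flags; this one just records which
  // combination ran.
  SUCCEED() << restrict_to_tpu_nodes << " " << wrap_in_function;
}

// Generates the four combinations and readable suffixes such as
// "with_filter_in_function", mirroring the instantiation above.
INSTANTIATE_TEST_SUITE_P(
    AllCombinations, DemoTest,
    ::testing::Combine(::testing::Bool(), ::testing::Bool()),
    [](const ::testing::TestParamInfo<DemoTest::ParamType>& info) {
      return std::string(std::get<0>(info.param) ? "with_filter"
                                                 : "without_filter") +
             (std::get<1>(info.param) ? "_in_function" : "_in_graph");
    });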
+ { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto arg_0 = ops::_Arg(scope.WithOpName("arg0"), DT_BOOL, 0); + auto arg_1 = ops::_Arg(scope.WithOpName("arg1"), DT_INT32, 1); + auto arg_2 = ops::_Arg(scope.WithOpName("arg2"), DT_INT32, 2); + auto identity = ops::Identity(scope.WithOpName("cond/Identity"), arg_0); + auto cond = ops::Const( + scope.WithOpName("cond").WithControlDependencies(identity), 17); + auto mul = ops::Mul(scope.WithOpName("cond/Mul"), arg_1, cond); + auto retval0 = ops::_Retval(scope.WithOpName("retval0_RetVal"), mul, 0); + + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); + + InstantiationResultForTest result; + TF_EXPECT_OK(InstantiateFunctionForTest(then_fn.name(), library, &result)); + + EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types); + EXPECT_EQ((DataTypeVector{DT_BOOL, DT_INT32, DT_INT32}), result.arg_types); + TF_EXPECT_GRAPH_EQ(expected, result.gdef); + } + + // else body. + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto arg_0 = ops::_Arg(scope.WithOpName("arg0"), DT_BOOL, 0); + auto arg_1 = ops::_Arg(scope.WithOpName("arg1"), DT_INT32, 1); + auto arg_2 = ops::_Arg(scope.WithOpName("arg2"), DT_INT32, 2); + auto identity = ops::Identity(scope.WithOpName("cond/Identity_1"), arg_0); + auto cond_1 = ops::Const( + scope.WithOpName("cond_1").WithControlDependencies(identity), 23); + auto add = ops::Add(scope.WithOpName("cond/false/add"), arg_2, cond_1); + auto retval0 = ops::_Retval(scope.WithOpName("retval0_RetVal"), add, 0); + + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); + + InstantiationResultForTest result; + TF_EXPECT_OK(InstantiateFunctionForTest(else_fn.name(), library, &result)); + + EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types); + EXPECT_EQ((DataTypeVector{DT_BOOL, DT_INT32, DT_INT32}), result.arg_types); + TF_EXPECT_GRAPH_EQ(expected, result.gdef); + } +} + +void ConditionalTestFixture::RunTest() { + Graph graph(OpRegistry::Global()); + if (wrap_condition_in_function_) { + // Wrap condition in a function which is called from `graph`. + Scope scope = Scope::NewRootScope().ExitOnError(); + auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32); + + Graph cond_graph(OpRegistry::Global()); + BuildCondGraph(&cond_graph); + + FunctionDef cond_fdef; + TF_ASSERT_OK(GraphToFunctionDef(cond_graph, "cond_fn", &cond_fdef)); + + FunctionDefLibrary fdef_lib; + *(fdef_lib.add_function()) = cond_fdef; + TF_ASSERT_OK(scope.graph()->AddFunctionLibrary(fdef_lib)); + NodeDef cond_fn; + cond_fn.set_name("cond_node"); + cond_fn.set_op("cond_fn"); + *(cond_fn.add_input()) = "source"; + Status status; + scope.graph()->AddNode(cond_fn, &status); + TF_ASSERT_OK(status); + TF_ASSERT_OK(scope.ToGraph(&graph)); + } else { + // Build condition in `graph`. + BuildCondGraph(&graph); + } + FunctionLibraryDefinition library(graph.flib_def()); // If `restrict_to_tpu_nodes_` is true let filter function return true for // `_tpu_replicate` nodes. NodeFilter node_filter = @@ -116,99 +246,47 @@ void ConditionalTestFixture::RunTest() { ? 
[](const Node* n) { return n->attrs().Find("_tpu_replicate"); } : NodeFilter{}; - FunctionLibraryDefinition library(OpRegistry::Global(), {}); GraphDef optimized_graph_def; graph.ToGraphDef(&optimized_graph_def); - TF_ASSERT_OK(FunctionalizeControlFlowForGraphDef(&optimized_graph_def, - &library, node_filter)); - TF_ASSERT_OK(FunctionalizeControlFlow(&graph, &library, node_filter)); - GraphDef converted_graph_def; - graph.ToGraphDef(&converted_graph_def); + TF_ASSERT_OK(FunctionalizeControlFlowForGraphDef( + &optimized_graph_def, &library, node_filter, + /*include_functions=*/wrap_condition_in_function_)); + TF_ASSERT_OK(FunctionalizeControlFlow( + &graph, &library, node_filter, + /*include_functions=*/wrap_condition_in_function_)); - for (const GraphDef& graph_def : {optimized_graph_def, converted_graph_def}) { - string op_name; - NameAttrList then_fn; - NameAttrList else_fn; - TF_EXPECT_OK(FindIfThenAndElse(graph_def, &op_name, &then_fn, &else_fn)); - InstantiationResultForTest else_result; - TF_EXPECT_OK( - InstantiateFunctionForTest(else_fn.name(), library, &else_result)); + if (wrap_condition_in_function_) { + // Check if function body was functionalized. + auto pflr = absl::make_unique( + /*device_mgr=*/nullptr, tensorflow::Env::Default(), + /*config=*/nullptr, TF_GRAPH_DEF_VERSION, &library, + tensorflow::OptimizerOptions()); + FunctionLibraryRuntime* flr = + pflr->GetFLR(ProcessFunctionLibraryRuntime::kDefaultFLRDevice); + FunctionLibraryRuntime::Handle handle; - // Outer graph - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto y = ops::Placeholder(scope.WithOpName("y"), DT_INT32); - auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32); - auto less = ops::Less(scope.WithOpName("cond/Less"), y, x); - auto if_op = - ops::If(scope.WithOpName(op_name), less, - std::initializer_list{less, y, x}, {DT_INT32}, then_fn, - else_fn, ops::If::OutputShapes({PartialTensorShape()})); - auto id = ops::Identity(scope.WithOpName("cond/Merge"), if_op.output[0]); - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); - TF_EXPECT_GRAPH_EQ(expected, graph_def); - } - - // then body. - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto arg_0 = ops::_Arg(scope.WithOpName("arg0"), DT_BOOL, 0); - auto arg_1 = ops::_Arg(scope.WithOpName("arg1"), DT_INT32, 1); - auto arg_2 = ops::_Arg(scope.WithOpName("arg2"), DT_INT32, 2); - auto identity = ops::Identity(scope.WithOpName("cond/Identity"), arg_0); - auto cond = ops::Const( - scope.WithOpName("cond").WithControlDependencies(identity), 17); - auto mul = ops::Mul(scope.WithOpName("cond/Mul"), arg_1, cond); - auto retval0 = ops::_Retval(scope.WithOpName("retval0_RetVal"), mul, 0); - - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); - - InstantiationResultForTest result; - TF_EXPECT_OK( - InstantiateFunctionForTest(then_fn.name(), library, &result)); - - EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types); - EXPECT_EQ((DataTypeVector{DT_BOOL, DT_INT32, DT_INT32}), - result.arg_types); - TF_EXPECT_GRAPH_EQ(expected, result.gdef); - } - - // else body. 
- { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto arg_0 = ops::_Arg(scope.WithOpName("arg0"), DT_BOOL, 0); - auto arg_1 = ops::_Arg(scope.WithOpName("arg1"), DT_INT32, 1); - auto arg_2 = ops::_Arg(scope.WithOpName("arg2"), DT_INT32, 2); - auto identity = ops::Identity(scope.WithOpName("cond/Identity_1"), arg_0); - auto cond_1 = ops::Const( - scope.WithOpName("cond_1").WithControlDependencies(identity), 23); - auto add = ops::Add(scope.WithOpName("cond/false/add"), arg_2, cond_1); - auto retval0 = ops::_Retval(scope.WithOpName("retval0_RetVal"), add, 0); - - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); - - InstantiationResultForTest result; - TF_EXPECT_OK( - InstantiateFunctionForTest(else_fn.name(), library, &result)); - - EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types); - EXPECT_EQ((DataTypeVector{DT_BOOL, DT_INT32, DT_INT32}), - result.arg_types); - TF_EXPECT_GRAPH_EQ(expected, result.gdef); + // Functionalized function name is the type string of `cond_node`. + string func_name; + for (Node* n : graph.nodes()) { + if (n->name() == "cond_node") { + func_name = n->type_string(); + break; + } } + TF_ASSERT_OK(flr->Instantiate(func_name, AttrSlice(), &handle)); + const FunctionBody* body = flr->GetFunctionBody(handle); + GraphDef graph_def; + body->graph->ToGraphDef(&graph_def); + CheckGraphDef(graph_def, library); + } else { + // Check if graphs were functionalized. + CheckGraphDef(optimized_graph_def, library); + GraphDef converted_graph_def; + graph.ToGraphDef(&converted_graph_def); + CheckGraphDef(converted_graph_def, library); } } -TEST_P(ConditionalTestFixture, ConditionalTests) { RunTest(); } - -INSTANTIATE_TEST_SUITE_P( - FunctionalizeControlFlow, ConditionalTestFixture, ::testing::Bool(), - [](const ::testing::TestParamInfo& - info) { return info.param ? "with_filter" : "without_filter"; }); - // Returns the names of the "cond" and "body" functions for the While node // in a graph. Status FindWhileCondAndBody(const GraphDef& graph, NameAttrList* cond, diff --git a/tensorflow/compiler/tf2xla/kernels/broadcast_to_op.cc b/tensorflow/compiler/tf2xla/kernels/broadcast_to_op.cc index d7a8e67dd33..807c061b60f 100644 --- a/tensorflow/compiler/tf2xla/kernels/broadcast_to_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/broadcast_to_op.cc @@ -16,6 +16,7 @@ limitations under the License. 
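The single-bool instantiation removed above names cases only by the node-filter flag; with the added `wrap_condition_in_function_` parameter the fixture presumably becomes a two-bool parameterized test. A hypothetical instantiation using ::testing::Combine is sketched here; the parameter order and naming lambda are assumptions, not the actual code of this change:

    INSTANTIATE_TEST_SUITE_P(
        FunctionalizeControlFlow, ConditionalTestFixture,
        ::testing::Combine(::testing::Bool(), ::testing::Bool()),
        [](const ::testing::TestParamInfo<ConditionalTestFixture::ParamType>& info) {
          // Assumed meaning: first bool restricts functionalization to TPU
          // nodes, second bool wraps the condition in a function.
          return absl::StrCat(
              std::get<0>(info.param) ? "with_filter" : "without_filter",
              std::get<1>(info.param) ? "_in_function" : "_in_graph");
        });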
#include "tensorflow/compiler/tf2xla/lib/broadcast.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" @@ -28,13 +29,26 @@ class BroadcastToOp : public XlaOpKernel { : XlaOpKernel(context) {} void Compile(XlaOpKernelContext* context) override { - const TensorShape input_shape = context->InputShape(0); TensorShape output_shape; OP_REQUIRES_OK(context, context->ConstantInputAsShape(1, &output_shape)); + auto output_status_or = + BroadcastTo(context->Input(0), output_shape.dim_sizes()); + OP_REQUIRES_OK(context, output_status_or.status()); + auto output = output_status_or.ValueOrDie(); + std::vector dynamic_dims; + OP_REQUIRES_OK( + context, context->ResolveInputDynamismIntoPredVector(1, &dynamic_dims)); + for (int64 dim = 0; dim < dynamic_dims.size(); ++dim) { + if (dynamic_dims[dim]) { + output = xla::SetDimensionSize( + output, + xla::Reshape(xla::Slice(context->Input(1), {dim}, {dim + 1}, {1}), + {}), + dim); + } + } - auto output = BroadcastTo(context->Input(0), output_shape.dim_sizes()); - OP_REQUIRES_OK(context, output.status()); - context->SetOutput(0, output.ValueOrDie()); + context->SetOutput(0, output); } }; diff --git a/tensorflow/compiler/tf2xla/kernels/replica_id_op.cc b/tensorflow/compiler/tf2xla/kernels/replica_id_op.cc index 46585a26769..71920372cde 100644 --- a/tensorflow/compiler/tf2xla/kernels/replica_id_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/replica_id_op.cc @@ -30,7 +30,8 @@ class XlaReplicaIdOp : public XlaOpKernel { }; void XlaReplicaIdOp::Compile(XlaOpKernelContext* ctx) { - ctx->SetOutput(0, xla::ReplicaId(ctx->builder())); + ctx->SetOutput( + 0, xla::ConvertElementType(xla::ReplicaId(ctx->builder()), xla::S32)); } REGISTER_XLA_OP(Name("XlaReplicaId"), XlaReplicaIdOp); diff --git a/tensorflow/compiler/tf2xla/kernels/reshape_op.cc b/tensorflow/compiler/tf2xla/kernels/reshape_op.cc index a85ba547179..213045e428a 100644 --- a/tensorflow/compiler/tf2xla/kernels/reshape_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/reshape_op.cc @@ -19,8 +19,10 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" @@ -108,38 +110,73 @@ class ReshapeOp : public XlaOpKernel { VLOG(2) << "Reshape from " << input_shape.DebugString() << " to " << shape.DebugString() << ", unknown_index=" << unknown_index; + auto input_xla_shape = ctx->InputXlaShape(0); + if (input_xla_shape->is_static()) { + ctx->SetOutput(0, xla::Reshape(ctx->Input(0), shape.dim_sizes())); + return; + } + // Handing dynamic reshapes if input contains a dynamic dimension. + std::vector output_dim_sizes; + std::vector dims_are_dynamic; + for (int64 i = 0; i < shape.dims(); ++i) { + output_dim_sizes.push_back( + xla::Reshape(xla::Slice(ctx->Input(1), {i}, {i + 1}, {1}), {})); + } + OP_REQUIRES_OK( + ctx, ctx->ResolveInputDynamismIntoPredVector(1, &dims_are_dynamic)); + if (unknown_index == -1) { + // No unknown index. 
+ ctx->SetOutput(0, + xla::DynamicReshape(ctx->Input(0), output_dim_sizes, + shape.dim_sizes(), dims_are_dynamic)); + return; + } + auto common_factors = + xla::CommonFactors(input_shape.dim_sizes(), shape.dim_sizes()); - int dynamic_dimension = -1; - if (ctx->InputXlaShape(0)->is_dynamic()) { - std::vector dynamic_dims; - OP_REQUIRES_OK(ctx, - ctx->ResolveInputDynamismIntoPredVector(1, &dynamic_dims)); - for (int d = 0; d < num_dims; ++d) { - const bool dim_is_dynamic = dynamic_dims[d]; - if (dim_is_dynamic) { - dynamic_dimension = d; + // Find common_factors that the input belongs to. + for (int64 i = 0; i < common_factors.size() - 1; ++i) { + auto start = common_factors[i]; + auto end = common_factors[i + 1]; + bool input_is_dynamic = false; + // product of all input dims in this group. E.g., in + // reshape(Tensor([2, 3, 3]), [3, -1, 3]) product of the group + // containing -1 will be 6. + xla::XlaOp product = xla::One(ctx->builder(), xla::S32); + for (int64 dim = start.first; dim < end.first; ++dim) { + if (input_xla_shape->is_dynamic_dimension(dim)) { + input_is_dynamic = true; + } + product = xla::Mul(product, xla::GetDimensionSize(ctx->Input(0), dim)); + } + bool unknown_dim_in_group = false; + // The real size for the -1 dimension in a reshape. E.g., in + // reshape(Tensor([2, 3, 3]), [3, -1, 3]) this will be 2. + xla::XlaOp unknown_dim_size = product; + for (int64 dim = start.second; dim < end.second; ++dim) { + if (dim == unknown_index) { + unknown_dim_in_group = true; + } else { + unknown_dim_size = xla::Div(unknown_dim_size, output_dim_sizes[dim]); } } - // When reshaping from dynamic dimension, unkwown index is considered - // dynamic. E.g., - // [<=10] - // | - // Reshape - // | - // [2, -1] - // The second dimension is dynamic. - if (dynamic_dimension == -1) { - dynamic_dimension = unknown_index; + if (unknown_dim_in_group) { + // If input dim is dynamic, output dim at the -1 position must be + // dynamic. Similarly, if input dim is static, output dim has to be + // static at the -1 dimension. + dims_are_dynamic[unknown_index] = input_is_dynamic; + output_dim_sizes[unknown_index] = unknown_dim_size; + + ctx->SetOutput( + 0, xla::DynamicReshape(ctx->Input(0), output_dim_sizes, + shape.dim_sizes(), dims_are_dynamic)); + VLOG(2) << "Reshape from " << ctx->InputXlaShape(0)->ToString() + << " to " << xla::VectorString(shape.dim_sizes()) + << ", dynamic_dims=" << xla::VectorString(dims_are_dynamic); + return; } - VLOG(2) << "Reshape from " << ctx->InputXlaShape(0)->ToString() << " to " - << xla::VectorString(shape.dim_sizes()) - << ", dynamic_dim=" << dynamic_dimension; } - // Pass unknown_index to Xla::Reshape as a hint for dynamic shape inference - // in XLA to know which output dimension is dynamic. - ctx->SetOutput(0, xla::ReshapeWithInferredDimension( - ctx->Input(0), shape.dim_sizes(), dynamic_dimension)); } }; diff --git a/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc b/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc index 97359f81eee..d63b8146491 100644 --- a/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc @@ -74,12 +74,44 @@ class UnsortedSegmentReduce : public XlaOpKernel { " vs. ", indices_shape.dim_size(d))); } xla::XlaBuilder* builder = ctx->builder(); + // data shape = [indices_shape, segment_shape] + // buffer shape = [num_segment, segment_shape] + // We now create the buffer shape by reverse enginerring data shape into + // indices shape and segment shape. 
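A small worked example of the shape bookkeeping described in the comment above, with hypothetical sizes (data [8, 3, 16, 5], indices [8, 3], num_segments = 10); this is a standalone sketch, not the kernel code:

    #include "tensorflow/core/framework/tensor_shape.h"

    tensorflow::TensorShape data_shape({8, 3, 16, 5});      // [indices_shape, segment_shape]
    tensorflow::TensorShape indices_shape({8, 3});
    tensorflow::TensorShape buffer_shape = data_shape;
    buffer_shape.RemoveDimRange(0, indices_shape.dims());   // segment_shape part: [16, 5]
    buffer_shape.InsertDim(0, /*size=*/10);                  // [num_segments, segment_shape]: [10, 16, 5]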
TensorShape buffer_shape = data_shape; buffer_shape.RemoveDimRange(0, indices_shape.dims()); buffer_shape.InsertDim(0, num_segments); + auto buffer = xla::Broadcast(InitialValue(builder), buffer_shape.dim_sizes()); + // Build dynamic dim sizes for buffer, as well as whether each dimension + // size is dynamic or static. We build two parts: num_sgement part and + // segment_shape part. + std::vector buffer_dims; + std::vector buffer_dims_are_dynamic; + // Build the "num_segment" part. + bool num_segments_is_dynamic; + OP_REQUIRES_OK( + ctx, ctx->ResolveInputDynamismIntoPred(2, &num_segments_is_dynamic)); + + buffer_dims.insert(buffer_dims.begin(), ctx->Input(2)); + buffer_dims_are_dynamic.insert(buffer_dims_are_dynamic.begin(), + num_segments_is_dynamic); + // Build the segment shape part. + for (int64 i = indices_shape.dims(); i < data_shape.dims(); ++i) { + buffer_dims.push_back(xla::GetDimensionSize(data, i)); + buffer_dims_are_dynamic.push_back( + ctx->InputXlaShape(0)->is_dynamic_dimension(i)); + } + + for (int64 i = 0; i < buffer_dims.size(); ++i) { + if (buffer_dims_are_dynamic[i]) { + // For each dynamic dimension, call set-dimension-size on it. + buffer = xla::SetDimensionSize(buffer, buffer_dims[i], i); + } + } + auto combiner = [this](xla::XlaOp a, xla::XlaOp b, xla::XlaBuilder* builder) { return Combine(a, b); }; diff --git a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc index 784b790767c..72cb746f5ff 100644 --- a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc @@ -15,6 +15,9 @@ limitations under the License. #include "tensorflow/core/util/strided_slice_op.h" +#include + +#include "absl/algorithm/container.h" #include "absl/types/span.h" #include "tensorflow/compiler/tf2xla/literal_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" @@ -23,6 +26,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/lib/constants.h" #include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/ops_util.h" #include "tensorflow/core/framework/register_types.h" @@ -33,6 +37,7 @@ limitations under the License. namespace tensorflow { namespace { +using errors::InvalidArgument; class StridedSliceOp : public XlaOpKernel { public: @@ -48,7 +53,7 @@ class StridedSliceOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { const TensorShape input_shape = ctx->InputShape(0); const TensorShape begin_shape = ctx->InputShape("begin"); - + VLOG(0) << "strided slice"; OP_REQUIRES( ctx, begin_shape.dims() == 1, errors::InvalidArgument("'begin' input has to be a rank 1 vector")); @@ -78,20 +83,24 @@ class StridedSliceOp : public XlaOpKernel { TensorShape final_shape; PartialTensorShape dummy_processing_shape, partial_final_shape; bool dummy = false; - OP_REQUIRES_OK(ctx, ValidateStridedSliceOp( - begin_is_constant ? &begin_tensor : nullptr, - end_is_constant ? &end_tensor : nullptr, - strides_tensor, input_shape, begin_mask_, end_mask_, - ellipsis_mask_, new_axis_mask_, shrink_axis_mask_, - &dummy_processing_shape, &partial_final_shape, - &dummy, &dummy, &dummy, &begin, &end, &strides)); + absl::InlinedVector output_to_sparse_mapping; + absl::InlinedVector output_to_processing_mapping; + OP_REQUIRES_OK( + ctx, + ValidateStridedSliceOp( + begin_is_constant ? 
&begin_tensor : nullptr, + end_is_constant ? &end_tensor : nullptr, strides_tensor, + input_shape, begin_mask_, end_mask_, ellipsis_mask_, new_axis_mask_, + shrink_axis_mask_, &dummy_processing_shape, &partial_final_shape, + &dummy, &dummy, &dummy, &begin, &end, &strides, + &output_to_sparse_mapping, &output_to_processing_mapping)); - OP_REQUIRES(ctx, partial_final_shape.AsTensorShape(&final_shape), - errors::InvalidArgument( - "XLA can't deduce compile time constant output " - "shape for strided slice: ", - partial_final_shape.DebugString(), - ", output shape must be a compile-time constant")); + OP_REQUIRES( + ctx, partial_final_shape.AsTensorShape(&final_shape), + InvalidArgument("XLA can't deduce compile time constant output " + "shape for strided slice: ", + partial_final_shape.DebugString(), + ", output shape must be a compile-time constant")); xla::XlaOp slice = ctx->Input(0); if (begin_is_constant && end_is_constant) { @@ -119,69 +128,84 @@ class StridedSliceOp : public XlaOpKernel { auto operand_shape_or = ctx->builder()->GetShape(ctx->Input(0)); OP_REQUIRES_OK(ctx, operand_shape_or.status()); xla::Shape xla_shape = operand_shape_or.ValueOrDie(); - if (xla_shape.is_static()) { - // Static output shape, return a static slice. - slice = xla::Reshape(slice, final_shape.dim_sizes()); + std::vector begins_are_dynamic; + OP_REQUIRES_OK( + ctx, ctx->ResolveInputDynamismIntoPredVector(1, &begins_are_dynamic)); + std::vector ends_are_dynamic; + OP_REQUIRES_OK( + ctx, ctx->ResolveInputDynamismIntoPredVector(2, &ends_are_dynamic)); + bool begins_are_static = absl::c_all_of( + begins_are_dynamic, [](bool dynamic) { return !dynamic; }); + OP_REQUIRES(ctx, begins_are_static, + errors::InvalidArgument( + "XLA can't use dynamic begin values for slice.")); + bool ends_are_static = absl::c_all_of( + ends_are_dynamic, [](bool dynamic) { return !dynamic; }); + // Static output shape, return a static slice. + slice = xla::Reshape(slice, final_shape.dim_sizes()); + if (xla_shape.is_static() && ends_are_static) { ctx->SetOutput(0, slice); return; } - auto input_dim_sizes = input_shape.dim_sizes(); - for (int64 i = 0; i < xla_shape.rank(); ++i) { - if (xla_shape.is_dynamic_dimension(i)) { - input_dim_sizes[i] = -1; + for (int64 i = 0; i < final_shape.dims(); ++i) { + int64 input_index = output_to_processing_mapping[i]; + if (input_index == -1) { + continue; } - } - PartialTensorShape input_partial_shape(input_dim_sizes); - partial_final_shape.Clear(); - end.clear(); - strides.clear(); - begin.clear(); - // Run shape inferenference again with partial shape. - OP_REQUIRES_OK(ctx, ValidateStridedSliceOp( - &begin_tensor, &end_tensor, strides_tensor, - input_partial_shape, begin_mask_, end_mask_, - ellipsis_mask_, new_axis_mask_, shrink_axis_mask_, - &dummy_processing_shape, &partial_final_shape, - &dummy, &dummy, &dummy, &begin, &end, &strides)); - if (partial_final_shape.AsTensorShape(&final_shape)) { - // Static output shape, return a static slice. - slice = xla::Reshape(slice, final_shape.dim_sizes()); - ctx->SetOutput(0, slice); - return; - } + bool input_is_dynamic = xla_shape.is_dynamic_dimension(input_index); - // We consider slicing a dynamic tensor t with negative indices as a - // dynamic sized slice. 
E.g., t[: -n], the result length is shape(t) - n - for (int64 i = 0; i < partial_final_shape.dims(); ++i) { - bool dynamic_dim = partial_final_shape.dim_size(i) - 1; - bool backward_slice = end[i] < 0; - if (dynamic_dim && backward_slice) { + int64 sparse_index = output_to_sparse_mapping[i]; + bool end_is_dynamic = + sparse_index == -1 ? false : ends_are_dynamic[sparse_index]; + bool backward_slice = sparse_index == -1 + ? false + : end_literal.Get({sparse_index}) < 0; + if ((input_is_dynamic && backward_slice) || end_is_dynamic) { OP_REQUIRES( - ctx, strides[i] == 1, + ctx, strides[input_index] == 1, errors::InvalidArgument("XLA has not implemented dynamic " "sized slice with non-trival stride yet. " "Please file a bug against XLA")); - - OP_REQUIRES(ctx, begin[i] >= 0, - errors::InvalidArgument( - "XLA has not implemented dynamic " - "sized slice with negative begin index %lld. " - "Please file a bug against XLA", - begin[i])); // If there is a dynamic dimension, properly set dimension size of // the result. - auto operand_size = xla::GetDimensionSize(ctx->Input(0), i); - - operand_size = xla::Add( - operand_size, xla::ConstantR0(ctx->builder(), end[i])); + auto operand_size = xla::GetDimensionSize(ctx->Input(0), input_index); + if (backward_slice) { + // We consider slicing a dynamic tensor t with negative indices as + // a dynamic sized slice. E.g., t[: -n], the result length is + // shape(t) - n. + OP_REQUIRES(ctx, !end_is_dynamic, + errors::InvalidArgument( + "XLA has not implemented dynamic " + "sized slice with dynamic negative index %lld. ")); + operand_size = xla::Add( + operand_size, + xla::ConstantR0(ctx->builder(), + end_literal.Get({sparse_index}))); + } else { + // The end of slice with dynamic slice size is the min of operand + // shape and slice size. E.g., t[:end_size], result size is + // min(shape(t), end_size). + xla::XlaOp end_size; + if (end_is_dynamic) { + end_size = xla::Reshape(xla::Slice(ctx->Input(2), {sparse_index}, + {sparse_index + 1}, {1}), + {}); + } else { + end_size = + xla::ConstantR0(ctx->builder(), end[input_index]); + } + operand_size = xla::Min(operand_size, end_size); + } slice = xla::SetDimensionSize( slice, - xla::Sub(operand_size, - xla::ConstantR0(ctx->builder(), begin[i])), + xla::Sub(operand_size, xla::ConstantR0( + ctx->builder(), begin[input_index])), i); } } + ctx->SetOutput(0, slice); + return; } else { // When output shape is fully defined, it must be a size one slice: // @@ -239,9 +263,9 @@ class StridedSliceOp : public XlaOpKernel { std::vector output_shape_dim_sizes; slice = xla::DynamicSlice(slice, start_indices, slice_sizes); + slice = xla::Reshape(slice, final_shape.dim_sizes()); + ctx->SetOutput(0, slice); } - slice = xla::Reshape(slice, final_shape.dim_sizes()); - ctx->SetOutput(0, slice); } private: diff --git a/tensorflow/compiler/tf2xla/lib/data_format.cc b/tensorflow/compiler/tf2xla/lib/data_format.cc index e5913a8bbf3..eb1ab79d165 100644 --- a/tensorflow/compiler/tf2xla/lib/data_format.cc +++ b/tensorflow/compiler/tf2xla/lib/data_format.cc @@ -62,7 +62,7 @@ xla::StatusOr Expand(xla::XlaOp input, int64 dim) { std::vector expanded_shape = xla::SpanToVector(input_shape.dimensions()); expanded_shape[dim] /= 4; - expanded_shape.insert(expanded_shape.begin() + dim, 4); + expanded_shape.insert(expanded_shape.begin() + dim + 1, 4); // Move the newly created dimension to the end with a transpose. 
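The one-line data_format.cc fix above changes where the factor-of-4 dimension is inserted when a packed dimension is split. A worked example with hypothetical sizes, splitting dim = 3 of a [8, 5, 5, 8] shape:

    std::vector<int64_t> expanded_shape = {8, 5, 5, 8};
    const int dim = 3;
    expanded_shape[dim] /= 4;                                    // {8, 5, 5, 2}
    expanded_shape.insert(expanded_shape.begin() + dim + 1, 4);  // {8, 5, 5, 2, 4}
    // The previous insert position (begin() + dim) produced {8, 5, 5, 4, 2},
    // i.e. the size-4 sub-dimension landed before the reduced dimension
    // instead of after it.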
std::vector permutation; diff --git a/tensorflow/compiler/tf2xla/mlir_tf2xla.cc b/tensorflow/compiler/tf2xla/mlir_tf2xla.cc index abaeb305104..db1a6929934 100644 --- a/tensorflow/compiler/tf2xla/mlir_tf2xla.cc +++ b/tensorflow/compiler/tf2xla/mlir_tf2xla.cc @@ -152,6 +152,7 @@ Status ConvertGraphDefToXlaViaMlir( RegisterDialects(); mlir::MLIRContext context; + context.loadAllGloballyRegisteredDialects(); TF_ASSIGN_OR_RETURN( mlir::OwningModuleRef module, ConvertGraphdefToMlir(pruned_graph_def, debug_info, specs, &context)); diff --git a/tensorflow/compiler/tf2xla/tf2xla.cc b/tensorflow/compiler/tf2xla/tf2xla.cc index 242a2b04ab9..3cf9df64b0b 100644 --- a/tensorflow/compiler/tf2xla/tf2xla.cc +++ b/tensorflow/compiler/tf2xla/tf2xla.cc @@ -137,7 +137,6 @@ Status ConvertVarHandlesToAotVarHandles(GraphDef* graph_def) { const auto& it = node.attr().find("allowed_devices"); if (it != node.attr().end()) { if (!it->second.list().s().empty()) { - // TODO(b/149512838): Support non-empty allowed devices. return errors::InvalidArgument( "VarHandleOp with non-empty allowed devices is not supported."); } diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index 635b7170d82..f8319cd446a 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -36,6 +36,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/protobuf_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/common_runtime/device.h" @@ -990,20 +991,6 @@ Status XlaCompiler::BuildArguments( tuple = xla::Parameter(builder, 0, (*input_shapes)[0], "arg_tuple"); } - for (int i = 0, end = input_to_args->size(); i < end; ++i) { - const XlaCompiler::Argument& arg = args[input_to_args->at(i)]; - for (const auto& dim_and_arg_num : arg.dynamic_dim_to_arg_num_map) { - int dynamic_size_param_index = arg_to_inputs.at(dim_and_arg_num.second); - VLOG(1) << "Setting dynamic binding " << i << " -> " - << dynamic_size_param_index; - - TF_RETURN_IF_ERROR(builder->SetDynamicBinding( - /*dynamic_size_param_num=*/0, {dynamic_size_param_index}, - /*target_param_num=*/0, /*target_param_index=*/{i}, - dim_and_arg_num.first)); - } - } - for (std::vector::size_type i = 0; i < input_to_args->size(); ++i) { auto it = arg_shardings.find(i); xla::XlaScopedShardingAssignment assign_sharding( @@ -1035,16 +1022,17 @@ Status XlaCompiler::BuildArguments( absl::StrCat("arg", i)); } } + } - for (int i = 0, end = input_to_args->size(); i < end; ++i) { - const XlaCompiler::Argument& arg = args[input_to_args->at(i)]; - for (const auto& dim_and_arg_num : arg.dynamic_dim_to_arg_num_map) { - int dynamic_size_param_index = arg_to_inputs.at(dim_and_arg_num.second); - TF_RETURN_IF_ERROR(builder->SetDynamicBinding( - /*dynamic_size_param_num=*/dynamic_size_param_index, {}, - /*target_param_num=*/i, /*target_param_index=*/{}, - dim_and_arg_num.first)); - } + for (int i = 0, end = input_to_args->size(); i < end; ++i) { + const XlaCompiler::Argument& arg = args[input_to_args->at(i)]; + for (const auto& dim_and_arg_num : arg.dynamic_dim_to_arg_num_map) { + int dynamic_size_param_index = arg_to_inputs.at(dim_and_arg_num.second); + VLOG(1) << "Setting dynamic size " << i << " -> " + << dynamic_size_param_index; + arg_handles[i] = 
xla::SetDimensionSize( + arg_handles[i], arg_handles[dynamic_size_param_index], + dim_and_arg_num.first); } } @@ -1370,8 +1358,15 @@ Status XlaCompiler::SetDeviceToHostMetadata( const string& key, absl::Span types, absl::Span shapes) { if (host_compute_sends_.find(key) != host_compute_sends_.end()) { - return errors::InvalidArgument( - "Duplicate calls to SetDeviceToHostMetadata with key ", key); + tf2xla::HostTransferMetadata& existing_transfer = host_compute_sends_[key]; + tf2xla::HostTransferMetadata new_transfer; + SetTransfer(key, types, shapes, &new_transfer); + if (xla::protobuf_util::ProtobufEquals(existing_transfer, new_transfer)) { + return Status::OK(); + } else { + return errors::InvalidArgument( + "Duplicate calls to SetDeviceToHostMetadata with key ", key); + } } tf2xla::HostTransferMetadata& transfer = host_compute_sends_[key]; SetTransfer(key, types, shapes, &transfer); @@ -1396,9 +1391,16 @@ Status XlaCompiler::GetDeviceToHostShapes( Status XlaCompiler::SetHostToDeviceMetadata( const string& key, absl::Span types, absl::Span shapes) { - if (host_compute_recvs_.find(key) != host_compute_sends_.end()) { - return errors::InvalidArgument( - "Duplicate calls to SetHostToDeviceMetadata with key ", key); + if (host_compute_recvs_.find(key) != host_compute_recvs_.end()) { + tf2xla::HostTransferMetadata& existing_transfer = host_compute_recvs_[key]; + tf2xla::HostTransferMetadata new_transfer; + SetTransfer(key, types, shapes, &new_transfer); + if (xla::protobuf_util::ProtobufEquals(existing_transfer, new_transfer)) { + return Status::OK(); + } else { + return errors::InvalidArgument( + "Duplicate calls to SetHostToDeviceMetadata with key ", key); + } } tf2xla::HostTransferMetadata& transfer = host_compute_recvs_[key]; SetTransfer(key, types, shapes, &transfer); diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc index 5df508d60b3..f348552050b 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc @@ -1897,5 +1897,63 @@ TEST_F(XlaCompilerTest, AliasResourceUpdates) { EXPECT_EQ(alias.entries(0).parameter_number(), 0); } +// Tests that passing in an exact duplicate input to SetDeviceToHostMeatadata +// is not an error. +TEST_F(XlaCompilerTest, SetDeviceToHostMetadataExactDuplicate) { + XlaCompiler compiler(DefaultOptions()); + + const string& key = "comm_key"; + std::vector types{DT_INT32}; + std::vector shapes{TensorShape({2})}; + + TF_ASSERT_OK(compiler.SetDeviceToHostMetadata(key, types, shapes)); + TF_ASSERT_OK(compiler.SetDeviceToHostMetadata(key, types, shapes)); +} + +// Tests that passing in a mismatched duplicate input to +// SetDeviceToHostMeatadata is not an error. +TEST_F(XlaCompilerTest, SetDeviceToHostMetadataMismatchedDuplicate) { + XlaCompiler compiler(DefaultOptions()); + + const string& key = "comm_key"; + std::vector types{DT_INT32}; + std::vector shapes{TensorShape({2})}; + std::vector types2{DT_FLOAT}; + std::vector shapes2{TensorShape({1})}; + + TF_ASSERT_OK(compiler.SetDeviceToHostMetadata(key, types, shapes)); + Status status = compiler.SetDeviceToHostMetadata(key, types2, shapes2); + EXPECT_EQ(status.code(), error::Code::INVALID_ARGUMENT); +} + +// Tests that passing in an exact duplicate input to SetHostToDeviceMeatadata +// is not an error. 
+TEST_F(XlaCompilerTest, SetHostToDeviceMetadataExactDuplicate) { + XlaCompiler compiler(DefaultOptions()); + + const string& key = "comm_key"; + std::vector types{DT_INT32}; + std::vector shapes{TensorShape({2})}; + + TF_ASSERT_OK(compiler.SetHostToDeviceMetadata(key, types, shapes)); + TF_ASSERT_OK(compiler.SetHostToDeviceMetadata(key, types, shapes)); +} + +// Tests that passing in a mismatched duplicate input to +// SetHostToDeviceMeatadata is not an error. +TEST_F(XlaCompilerTest, SetHostToDeviceMetadataMismatchedDuplicate) { + XlaCompiler compiler(DefaultOptions()); + + const string& key = "comm_key"; + std::vector types{DT_INT32}; + std::vector shapes{TensorShape({2})}; + std::vector types2{DT_FLOAT}; + std::vector shapes2{TensorShape({1})}; + + TF_ASSERT_OK(compiler.SetHostToDeviceMetadata(key, types, shapes)); + Status status = compiler.SetHostToDeviceMetadata(key, types2, shapes2); + EXPECT_EQ(status.code(), error::Code::INVALID_ARGUMENT); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/xla/bit_cast.h b/tensorflow/compiler/xla/bit_cast.h index 90e9a5c25dd..feb548c9433 100644 --- a/tensorflow/compiler/xla/bit_cast.h +++ b/tensorflow/compiler/xla/bit_cast.h @@ -29,7 +29,7 @@ limitations under the License. #include "absl/base/casts.h" #include "third_party/eigen3/Eigen/Core" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/bfloat16/bfloat16.h" +#include "tensorflow/core/platform/bfloat16.h" #include "tensorflow/core/platform/types.h" namespace xla { diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.cc b/tensorflow/compiler/xla/client/lib/arithmetic.cc index 20d9930341f..744cdcea14c 100644 --- a/tensorflow/compiler/xla/client/lib/arithmetic.cc +++ b/tensorflow/compiler/xla/client/lib/arithmetic.cc @@ -137,7 +137,7 @@ XlaComputation CreateMinMaxComputation(XlaBuilder* outer_builder, arg_max = Select(eq, tie_id, arg_max); } Tuple(b, {max, arg_max}); - return b->Build().ConsumeValueOrDie(); + return b->BuildAndNoteError(); } XlaOp ArgMinMax(XlaOp input, PrimitiveType output_type, int axis, bool is_min, diff --git a/tensorflow/compiler/xla/client/lib/comparators.cc b/tensorflow/compiler/xla/client/lib/comparators.cc index cd594a5cf39..c9d6cea740d 100644 --- a/tensorflow/compiler/xla/client/lib/comparators.cc +++ b/tensorflow/compiler/xla/client/lib/comparators.cc @@ -84,7 +84,12 @@ XlaComputation CreateScalarComparisonComputation( CHECK_NE(parameter_count, 0); - Shape shape = b->GetShape(lhs_params[0]).ValueOrDie(); + auto shape_or = b->GetShape(lhs_params[0]); + if (!shape_or.ok()) { + b->ReportError(shape_or.status()); + return {}; + } + Shape shape = shape_or.ValueOrDie(); shape.set_element_type(PRED); XlaOp param_equal = Broadcast(One(b.get(), shape.element_type()), AsInt64Slice(shape.dimensions())); diff --git a/tensorflow/compiler/xla/client/lib/math.cc b/tensorflow/compiler/xla/client/lib/math.cc index 6fdaab58686..cd9f88a74ce 100644 --- a/tensorflow/compiler/xla/client/lib/math.cc +++ b/tensorflow/compiler/xla/client/lib/math.cc @@ -1111,11 +1111,28 @@ XlaOp RoundToEven(XlaOp x) { // acos(x) = 2 * atan(sqrt(1 - x^2) / (1 + x)) if x != -1 // pi if x == -1 +// For complex: +// acos(x) = -(i * log(x + i * sqrt((1 + x) * (1 - x)))) XlaOp Acos(XlaOp x) { - return Select(Ne(x, FullLike(x, -1)), - ScalarLike(x, 2.0) * Atan2(Sqrt(ScalarLike(x, 1.0) - x * x), - ScalarLike(x, 1.0) + x), - FullLike(x, M_PI)); + XlaBuilder* b = x.builder(); + return b->ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(auto shape, 
b->GetShape(x)); + + if (primitive_util::IsComplexType(shape.element_type())) { + auto one = ScalarLike(x, 1); + auto imag_one = Complex( + Zero(b, primitive_util::ComplexComponentType(shape.element_type())), + One(b, primitive_util::ComplexComponentType(shape.element_type()))); + + auto result = + Neg(imag_one * Log(x + imag_one * Sqrt((one + x) * (one - x)))); + return result; + } + return Select(Ne(x, FullLike(x, -1)), + ScalarLike(x, 2.0) * Atan2(Sqrt(ScalarLike(x, 1.0) - x * x), + ScalarLike(x, 1.0) + x), + FullLike(x, M_PI)); + }); } // asin(x) = 2 * atan(x / (1 + sqrt(1 - x^2))) diff --git a/tensorflow/compiler/xla/client/lib/math_test.cc b/tensorflow/compiler/xla/client/lib/math_test.cc index cb79b2ef7db..ae4d839d8fa 100644 --- a/tensorflow/compiler/xla/client/lib/math_test.cc +++ b/tensorflow/compiler/xla/client/lib/math_test.cc @@ -660,5 +660,19 @@ XLA_TEST_F(MathTest, BesselI1eDouble) { ComputeAndCompareR1(&builder, expected, {}, error_spec_); } +XLA_TEST_F(MathTest, AcosComplexValues) { + XlaBuilder builder(TestName()); + auto x = ConstantR1>( + &builder, {{0, 0}, {0, 1}, {1, 1}, {0.8, 0.2}}); + + Acos(x); + std::vector> expected = { + {1.5707963267948966, 0}, + {1.5707963267948966, -0.881373587019543}, + {0.9045568943023814, -1.0612750619050357}, + {0.7011246914497526, -0.30527648462436596}}; + ComputeAndCompareR1>(&builder, expected, {}, error_spec_); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/prng.cc b/tensorflow/compiler/xla/client/lib/prng.cc index 044a742eddd..cc5639f1be1 100644 --- a/tensorflow/compiler/xla/client/lib/prng.cc +++ b/tensorflow/compiler/xla/client/lib/prng.cc @@ -426,32 +426,36 @@ RngOutput PhiloxRngBit64(XlaOp op_key, XlaOp initial_state, XlaOp ConvertRandomBitsToUniformFloatingPoint(XlaOp bits, XlaOp minval, XlaOp maxval) { XlaBuilder* builder = bits.builder(); - PrimitiveType value_type = - builder->GetShape(minval).ConsumeValueOrDie().element_type(); - PrimitiveType bit_type = - builder->GetShape(bits).ConsumeValueOrDie().element_type(); - CHECK((value_type == F32 && bit_type == U32) || - (value_type == F64 && bit_type == U64)); + return builder->ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(const Shape* minval_shape, + builder->GetShapePtr(minval)); + TF_ASSIGN_OR_RETURN(const Shape* bits_shape, builder->GetShapePtr(bits)); + PrimitiveType value_type = minval_shape->element_type(); + PrimitiveType bit_type = bits_shape->element_type(); + CHECK((value_type == F32 && bit_type == U32) || + (value_type == F64 && bit_type == U64)); - // Form random mantissa bits for float/double, with a leading 1 bit. - int num_float_bits = primitive_util::BitWidth(value_type); - // Subtract one as SignificandWidth includes the leading 1 bit. - int num_mantissa_bits = primitive_util::SignificandWidth(value_type) - 1; + // Form random mantissa bits for float/double, with a leading 1 bit. + int num_float_bits = primitive_util::BitWidth(value_type); + // Subtract one as SignificandWidth includes the leading 1 bit. + int num_mantissa_bits = primitive_util::SignificandWidth(value_type) - 1; - // Ignore the exponent bits and convert the mantissa bits to the floating - // point type. - bits = ShiftRightLogical( - bits, ScalarLike(bits, num_float_bits - num_mantissa_bits)); + // Ignore the exponent bits and convert the mantissa bits to the floating + // point type. 
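The mantissa trick described in the comments above can be illustrated outside of XLA with plain integer and float arithmetic. A minimal standalone sketch for the F32/U32 case (not the XLA client code):

    #include <cmath>
    #include <cstdint>

    // Keep the top 23 random bits as an integer in [0, 2^23), convert to
    // float, scale by 2^-23 into [0, 1), then map affinely onto [minval, maxval).
    float BitsToUniformFloat(uint32_t bits, float minval, float maxval) {
      constexpr int kNumFloatBits = 32;             // BitWidth(F32)
      constexpr int kNumMantissaBits = 23;          // SignificandWidth(F32) - 1
      bits >>= (kNumFloatBits - kNumMantissaBits);  // drop the would-be exponent bits
      float value = static_cast<float>(bits) * std::ldexp(1.0f, -kNumMantissaBits);
      return value * (maxval - minval) + minval;
    }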
+ bits = ShiftRightLogical( + bits, ScalarLike(bits, num_float_bits - num_mantissa_bits)); - // We have an integer-valued floating point number in the range - // [0, 2**{num_mantissa_bits}). - XlaOp values = ConvertElementType(bits, value_type); + // We have an integer-valued floating point number in the range + // [0, 2**{num_mantissa_bits}). + XlaOp values = ConvertElementType(bits, value_type); - // Divide by 2**{-num_mantissa_bits} to get a number in the range [0.0, 1.0). - values = values * ScalarLike(values, std::ldexp(1., -num_mantissa_bits)); + // Divide by 2**{-num_mantissa_bits} to get a number in the range + // [0.0, 1.0). + values = values * ScalarLike(values, std::ldexp(1., -num_mantissa_bits)); - // Multiply and add to shift to the range [minval, maxval). - return values * (maxval - minval) + minval; + // Multiply and add to shift to the range [minval, maxval). + return values * (maxval - minval) + minval; + }); } XlaOp ConvertRandomBitsToUniformInt(XlaOp bits, XlaOp minval, XlaOp maxval, diff --git a/tensorflow/compiler/xla/client/lib/quantize.h b/tensorflow/compiler/xla/client/lib/quantize.h index 26dbbd5b00b..320dfcbf062 100644 --- a/tensorflow/compiler/xla/client/lib/quantize.h +++ b/tensorflow/compiler/xla/client/lib/quantize.h @@ -25,7 +25,7 @@ limitations under the License. #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/bfloat16/bfloat16.h" +#include "tensorflow/core/platform/bfloat16.h" namespace xla { diff --git a/tensorflow/compiler/xla/client/lib/self_adjoint_eig.cc b/tensorflow/compiler/xla/client/lib/self_adjoint_eig.cc index 1c0680b883a..58905e4ca6f 100644 --- a/tensorflow/compiler/xla/client/lib/self_adjoint_eig.cc +++ b/tensorflow/compiler/xla/client/lib/self_adjoint_eig.cc @@ -228,7 +228,7 @@ StatusOr> WhileLoopFn( auto max_sweeps = ScalarLike(k, max_sweep_updates); auto sweep_update_cond = Gt(max_sweeps, k); - auto norms = ComputeFrobeniusNorms(values[2]).ValueOrDie(); + TF_ASSIGN_OR_RETURN(auto norms, ComputeFrobeniusNorms(values[2])); auto tol = norms.total_norm * values[3]; auto tol_cond = ReduceAll(Lt(tol, norms.off_diagonal_norm), xla::ConstantR0(cond_builder, false), @@ -400,7 +400,7 @@ SelfAdjointEigResult SelfAdjointEig(XlaOp a, bool lower, int64 max_iter, return result; }; auto shape_with_status = builder->GetShape(a); - if (!shape_with_status.status().ok()) { + if (!shape_with_status.ok()) { return return_error(shape_with_status.status()); } Shape a_shape = shape_with_status.ValueOrDie(); @@ -450,7 +450,7 @@ SelfAdjointEigResult SelfAdjointEig(XlaOp a, bool lower, int64 max_iter, S32, // "CyclicJacobi", // builder); - if (!output_with_status.status().ok()) { + if (!output_with_status.ok()) { return return_error(output_with_status.status()); } @@ -460,7 +460,11 @@ SelfAdjointEigResult SelfAdjointEig(XlaOp a, bool lower, int64 max_iter, result.v = output[1]; result.w = GetMatrixDiagonal(output[2]); - return SortByEigenvalues(result).ValueOrDie(); + auto result_or = SortByEigenvalues(result); + if (!result_or.ok()) { + return return_error(result_or.status()); + } + return result_or.ValueOrDie(); } } // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/svd.cc b/tensorflow/compiler/xla/client/lib/svd.cc index 646875a20a2..80ea4d644c0 100644 --- a/tensorflow/compiler/xla/client/lib/svd.cc +++ b/tensorflow/compiler/xla/client/lib/svd.cc @@ -837,8 +837,11 @@ SVDResult SVD(XlaOp a, int64 max_iter, float epsilon, auto eps = 
ScalarLike(a, epsilon); - SVDResult svd_result = - HouseHolderBidiagonalization(a, eps, precision).ValueOrDie(); + auto svd_result_or = HouseHolderBidiagonalization(a, eps, precision); + if (!svd_result_or.ok()) { + return return_error(svd_result_or.status()); + } + SVDResult svd_result = svd_result_or.ValueOrDie(); auto output_with_status = WhileLoopFn( { @@ -861,7 +864,13 @@ SVDResult SVD(XlaOp a, int64 max_iter, float epsilon, svd_result.u = output[1]; svd_result.v = output[2]; svd_result.d = output[3]; - svd_result = SortBySingularValuesAndPostProcessing(svd_result).ValueOrDie(); + + svd_result_or = SortBySingularValuesAndPostProcessing(svd_result); + if (!svd_result_or.ok()) { + return return_error(svd_result_or.status()); + } + svd_result = svd_result_or.ValueOrDie(); + if (maybe_transpose) { std::swap(svd_result.u, svd_result.v); } diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc index 2b69c71042d..34d78f9d933 100644 --- a/tensorflow/compiler/xla/client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -28,6 +28,7 @@ limitations under the License. #include "absl/strings/match.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_join.h" +#include "absl/types/span.h" #include "tensorflow/compiler/xla/client/sharding_builder.h" #include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/comparison_util.h" @@ -78,16 +79,13 @@ ShapeProto ConvertShapeProtoToPred(const ShapeProto& shape_proto) { return ShapeUtil::ChangeElementType(Shape(shape_proto), PRED).ToProto(); } -HloInstructionProto CreateConstantInstruction(int64 id, const Shape& shape, - bool pred) { - HloInstructionProto const_instr; +void SetInstructionAsConstant(HloInstructionProto* instr, int64 id, + const Shape& shape, bool pred) { Literal literal = LiteralUtil::CreateR0(pred); Literal literal_broadcast = literal.Broadcast(shape, {}).ValueOrDie(); - *const_instr.mutable_shape() = shape.ToProto(); - *const_instr.mutable_literal() = literal_broadcast.ToProto(); - *const_instr.mutable_opcode() = HloOpcodeString(HloOpcode::kConstant); - const_instr.set_id(id); - return const_instr; + *instr->mutable_shape() = shape.ToProto(); + *instr->mutable_literal() = literal_broadcast.ToProto(); + *instr->mutable_opcode() = HloOpcodeString(HloOpcode::kConstant); } // Converts a HloComputation into ReducerOr with predicate types. 
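The self_adjoint_eig and svd changes above all follow one shape: check the StatusOr and forward its Status instead of calling ValueOrDie(), so failures surface as error values rather than process aborts. A toy, self-contained version of that pattern with hypothetical functions (not library code):

    #include "tensorflow/compiler/xla/statusor.h"
    #include "tensorflow/compiler/xla/util.h"

    xla::StatusOr<int> Half(int x) {
      if (x % 2 != 0) return xla::InvalidArgument("x must be even, got %d", x);
      return x / 2;
    }

    xla::StatusOr<int> Quarter(int x) {
      auto half_or = Half(x);
      if (!half_or.ok()) {
        return half_or.status();  // propagate instead of half_or.ValueOrDie()
      }
      return Half(half_or.ValueOrDie());
    }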
@@ -1083,6 +1081,36 @@ XlaOp XlaBuilder::Reshape(const Shape& shape, XlaOp operand, }); } +XlaOp XlaBuilder::DynamicReshape(XlaOp operand, + absl::Span dim_sizes, + absl::Span new_size_bounds, + const std::vector& dims_are_dynamic) { + return ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); + std::vector dim_size_shape_ptrs; + TF_ASSIGN_OR_RETURN(const auto& dim_size_shapes, + GetOperandShapes(dim_sizes)); + + absl::c_transform(dim_size_shapes, std::back_inserter(dim_size_shape_ptrs), + [](const Shape& shape) { return &shape; }); + TF_ASSIGN_OR_RETURN(const Shape shape, + ShapeInference::InferDynamicReshapeShape( + *operand_shape, dim_size_shape_ptrs, + new_size_bounds, dims_are_dynamic)); + TF_RETURN_IF_ERROR(first_error_); + std::vector operands; + operands.reserve(1 + dim_sizes.size()); + operands.push_back(operand); + for (const XlaOp& dim_size : dim_sizes) { + operands.push_back(dim_size); + } + HloInstructionProto instr; + *instr.mutable_shape() = shape.ToProto(); + return AddInstruction(std::move(instr), HloOpcode::kDynamicReshape, + operands); + }); +} + XlaOp XlaBuilder::Collapse(XlaOp operand, absl::Span dimensions) { return ReportErrorOrReturn([&]() -> StatusOr { if (dimensions.size() <= 1) { @@ -1425,6 +1453,25 @@ StatusOr XlaBuilder::FftInternal( return AddInstruction(std::move(instr), HloOpcode::kFft, {operand}); } +StatusOr XlaBuilder::TriangularSolveInternal( + const Shape& shape, XlaOp a, XlaOp b, TriangularSolveOptions options) { + HloInstructionProto instr; + *instr.mutable_triangular_solve_options() = std::move(options); + *instr.mutable_shape() = shape.ToProto(); + + return AddInstruction(std::move(instr), HloOpcode::kTriangularSolve, {a, b}); +} + +StatusOr XlaBuilder::CholeskyInternal(const Shape& shape, XlaOp a, + bool lower) { + HloInstructionProto instr; + xla::CholeskyOptions& options = *instr.mutable_cholesky_options(); + options.set_lower(lower); + *instr.mutable_shape() = shape.ToProto(); + + return AddInstruction(std::move(instr), HloOpcode::kCholesky, {a}); +} + XlaOp XlaBuilder::Infeed(const Shape& shape, const string& config) { return ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; @@ -1935,7 +1982,6 @@ XlaOp XlaBuilder::RngUniform(XlaOp a, XlaOp b, const Shape& shape) { XlaOp XlaBuilder::RngBitGenerator(RandomAlgorithm algorithm, XlaOp initial_state, const Shape& shape) { return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(shape)); TF_ASSIGN_OR_RETURN(Shape state_shape, GetShape(initial_state)); Shape output_shape = shape; @@ -1954,14 +2000,22 @@ XlaOp XlaBuilder::RngBitGenerator(RandomAlgorithm algorithm, return InvalidArgument("Unsupported shape for RngBitGenerator: %s", PrimitiveType_Name(output_shape.element_type())); } - *instr.mutable_shape() = - ShapeUtil::MakeTupleShape({state_shape, output_shape}).ToProto(); - instr.set_rng_algorithm(algorithm); - return AddInstruction(std::move(instr), HloOpcode::kRngBitGenerator, - {initial_state}); + return RngBitGeneratorInternal( + ShapeUtil::MakeTupleShape({state_shape, output_shape}), algorithm, + initial_state); }); } +StatusOr XlaBuilder::RngBitGeneratorInternal( + const Shape& full_result_shape, RandomAlgorithm algorithm, + XlaOp initial_state) { + HloInstructionProto instr; + *instr.mutable_shape() = full_result_shape.ToProto(); + instr.set_rng_algorithm(algorithm); + return AddInstruction(std::move(instr), HloOpcode::kRngBitGenerator, 
+ {initial_state}); +} + XlaOp XlaBuilder::While(const XlaComputation& condition, const XlaComputation& body, XlaOp init) { return ReportErrorOrReturn([&]() -> StatusOr { @@ -2527,6 +2581,7 @@ XlaOp XlaBuilder::AllToAll(XlaOp operand, int64 split_dimension, } *(shape.mutable_tuple_shapes(i)->mutable_layout()) = *layout; } + instr.set_constrain_layout(true); } *instr.mutable_shape() = shape.ToProto(); @@ -2914,27 +2969,12 @@ StatusOr XlaBuilder::BuildDynamicInferenceGraph(XlaOp root_op) { *program_shape->mutable_result() = ShapeUtil::ChangeElementType(Shape(root->shape()), PRED).ToProto(); - std::set seen; - struct WorkItem { - explicit WorkItem(int64 handle, bool need_rewrite) - : handle(handle), need_rewrite(need_rewrite) {} - int64 handle; - // If need_rewrite is true, the instruction will be copied and rewrite into - // a pred instruction indicating if each value is dynamic. If need_rewrite - // is false, simply copy the instruction to the output graph. - // E.g., - // For select(P, A, B), we need to rewrite A and B into predicates, but - // don't need to rewrite P. - bool need_rewrite; - }; - std::queue worklist; - worklist.push(WorkItem(root->id(), true)); - entry.set_root_id(root->id()); std::vector called_computatons; - // Rewritre instruction with id "from" into the new graph. - // Returns more work items that need to finish. - auto rewrite_instruction = - [&](int64 from, bool need_rewrite) -> StatusOr> { + // Process instruction and copy it into the new graph. The new node in the new + // graph with have id set to `id`. + auto process_instruction = [&](const HloInstructionProto* instr_proto, + bool need_rewrite, int64 id, + absl::Span operand_ids) { // Rewrite the instruction with following rules: // - Unary ops: Convert into bitcast (identity) with type Pred. // - Binary ops: Convert into binary or. @@ -2947,22 +2987,20 @@ StatusOr XlaBuilder::BuildDynamicInferenceGraph(XlaOp root_op) { // - Constant: Convert to constant False. // - Other ops: Not supported. // Create the instruction for the new handle. 
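As a concrete, hand-written instance of these rewrite rules: for a value computed as add(parameter, constant), the dynamism graph replaces the add with a pred-typed or, rewrites the parameter to constant true (a parameter may be dynamic) and the constant to constant false, so the result is marked dynamic exactly where the parameter contributes; for select(p, a, b) only a and b are rewritten, since the predicate p itself does not make the selected value dynamic.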
- TF_ASSIGN_OR_RETURN(const HloInstructionProto* instr_proto, - LookUpInstructionByHandle(from)); - TF_ASSIGN_OR_RETURN(HloOpcode opcode, StringToHloOpcode(instr_proto->opcode())); - std::vector operands_todo; auto* new_instr = entry.add_instructions(); *new_instr = *instr_proto; - for (auto operand_id : new_instr->operand_ids()) { - operands_todo.emplace_back(operand_id, need_rewrite); + new_instr->set_id(id); + new_instr->mutable_operand_ids()->Clear(); + for (auto operand_id : operand_ids) { + new_instr->mutable_operand_ids()->Add(operand_id); } if (!need_rewrite) { *new_instr->mutable_name() = - GetFullName(instr_proto->opcode(), kNameSeparator, instr_proto->id()); - return operands_todo; + GetFullName(instr_proto->opcode(), kNameSeparator, id); + return Status::OK(); } *new_instr->mutable_shape() = ConvertShapeProtoToPred(instr_proto->shape()); Shape new_shape(new_instr->shape()); @@ -3017,10 +3055,8 @@ StatusOr XlaBuilder::BuildDynamicInferenceGraph(XlaOp root_op) { *new_instr->mutable_opcode() = HloOpcodeString(HloOpcode::kOr); break; case HloOpcode::kSelect: - operands_todo[0].need_rewrite = false; break; case HloOpcode::kGather: - operands_todo[1].need_rewrite = false; break; case HloOpcode::kReduce: { int64 reducer_id = new_instr->called_computation_ids(0); @@ -3042,39 +3078,101 @@ StatusOr XlaBuilder::BuildDynamicInferenceGraph(XlaOp root_op) { TF_ASSIGN_OR_RETURN(const HloInstructionProto* operand_proto, LookUpInstructionByHandle(operand_handle)); - *new_instr = CreateConstantInstruction( - from, new_shape, + SetInstructionAsConstant( + new_instr, id, new_shape, operand_proto->shape().is_dynamic_dimension(dimension)); - operands_todo.clear(); break; } case HloOpcode::kConstant: - *new_instr = CreateConstantInstruction(from, new_shape, false); + SetInstructionAsConstant(new_instr, id, new_shape, false); break; case HloOpcode::kParameter: - *new_instr = CreateConstantInstruction(from, new_shape, true); + SetInstructionAsConstant(new_instr, id, new_shape, true); break; default: return InvalidArgument("Dynamic inferencing %s is not supported", instr_proto->DebugString()); } *new_instr->mutable_name() = - GetFullName(instr_proto->opcode(), kNameSeparator, instr_proto->id()); - return operands_todo; + GetFullName(instr_proto->opcode(), kNameSeparator, id); + return Status::OK(); }; + struct WorkItem { + explicit WorkItem(int64 handle, bool need_rewrite) + : handle(handle), need_rewrite(need_rewrite), visited(false) {} + int64 handle; + // If need_rewrite is true, the instruction will be copied and rewrite into + // a pred instruction indicating if each value is dynamic. If need_rewrite + // is false, simply copy the instruction to the output graph. + // E.g., + // For select(P, A, B), we need to rewrite A and B into predicates, but + // don't need to rewrite P. + bool need_rewrite; + // Used in dfs to remember the ids of processed operands of this item. + std::vector processed_operands; + // Whether this node been visited before or not. + bool visited; + }; + // Only copy each pair of {handle, need_rewrite} once. Value is the id in the + // new graph. + absl::flat_hash_map, int64> seen; + // Monotonically increasing id to assign to new instructions. + int64 global_id = 0; + // The result id of the last rewritten item -- return value of last stack + // item. 
+ int64 stacktop_id = -1; + std::vector worklist; + worklist.push_back(WorkItem(root->id(), true)); while (!worklist.empty()) { - WorkItem item = worklist.front(); - worklist.pop(); - if (!seen.insert(item.handle).second) { + WorkItem& item = worklist.back(); + auto item_key = std::make_pair(item.handle, item.need_rewrite); + auto iter = seen.find(item_key); + // Already processed this item. Return previous results. + if (iter != seen.end()) { + stacktop_id = iter->second; + worklist.pop_back(); continue; } - TF_ASSIGN_OR_RETURN(auto todos, - rewrite_instruction(item.handle, item.need_rewrite)); - for (WorkItem& todo : todos) { - worklist.push(todo); + + int64 next_operand = item.processed_operands.size(); + TF_ASSIGN_OR_RETURN(const HloInstructionProto* instr_proto, + LookUpInstructionByHandle(item.handle)); + VLOG(3) << "Processing" << instr_proto->name(); + if (!item.visited) { + item.visited = true; + } else { + // Record previous processed operand. + item.processed_operands.push_back(stacktop_id); + next_operand++; } + TF_ASSIGN_OR_RETURN(HloOpcode opcode, + StringToHloOpcode(instr_proto->opcode())); + if (next_operand >= instr_proto->operand_ids_size() || + opcode == HloOpcode::kGetDimensionSize) { + // No more operands to process, process self. + int64 new_id = ++global_id; + VLOG(3) << "new_id: " << new_id << "instr: " << instr_proto->name(); + TF_RETURN_IF_ERROR(process_instruction(instr_proto, item.need_rewrite, + new_id, item.processed_operands)); + stacktop_id = new_id; + seen[item_key] = stacktop_id; + worklist.pop_back(); + continue; + } + + WorkItem next_item(instr_proto->operand_ids(next_operand), true); + if (opcode == HloOpcode::kSelect && next_operand == 0) { + next_item.need_rewrite = false; + } + if (opcode == HloOpcode::kGather && next_operand == 1) { + next_item.need_rewrite = false; + } + // Push next operand into worklist. 
+ worklist.push_back(next_item); } + TF_RET_CHECK(stacktop_id != -1); + entry.set_root_id(stacktop_id); absl::c_sort(*entry.mutable_instructions(), [](const HloInstructionProto& p1, const HloInstructionProto& p2) { return p1.id() < p2.id(); }); @@ -3466,6 +3564,13 @@ XlaOp Reshape(const Shape& shape, XlaOp operand) { return operand.builder()->Reshape(shape, operand); } +XlaOp DynamicReshape(XlaOp operand, absl::Span dim_sizes, + absl::Span new_size_bounds, + const std::vector& dims_are_dynamic) { + return operand.builder()->DynamicReshape(operand, dim_sizes, new_size_bounds, + dims_are_dynamic); +} + XlaOp ReshapeWithInferredDimension(XlaOp operand, absl::Span new_sizes, int64 inferred_dimension) { @@ -3684,36 +3789,26 @@ XlaOp TriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower, TriangularSolveOptions::Transpose transpose_a) { XlaBuilder* builder = a.builder(); return builder->ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; TF_ASSIGN_OR_RETURN(const Shape* a_shape, builder->GetShapePtr(a)); TF_ASSIGN_OR_RETURN(const Shape* b_shape, builder->GetShapePtr(b)); - xla::TriangularSolveOptions& options = - *instr.mutable_triangular_solve_options(); + xla::TriangularSolveOptions options; options.set_left_side(left_side); options.set_lower(lower); options.set_unit_diagonal(unit_diagonal); options.set_transpose_a(transpose_a); TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferTriangularSolveShape( *a_shape, *b_shape, options)); - *instr.mutable_shape() = shape.ToProto(); - - return builder->AddInstruction(std::move(instr), - HloOpcode::kTriangularSolve, {a, b}); + return builder->TriangularSolveInternal(shape, a, b, std::move(options)); }); } XlaOp Cholesky(XlaOp a, bool lower) { XlaBuilder* builder = a.builder(); return builder->ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; TF_ASSIGN_OR_RETURN(const Shape* a_shape, builder->GetShapePtr(a)); - xla::CholeskyOptions& options = *instr.mutable_cholesky_options(); - options.set_lower(lower); TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferCholeskyShape(*a_shape)); - *instr.mutable_shape() = shape.ToProto(); - - return builder->AddInstruction(std::move(instr), HloOpcode::kCholesky, {a}); + return builder->CholeskyInternal(shape, a, lower); }); } diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h index 6d30195d3d0..f841a1a75a0 100644 --- a/tensorflow/compiler/xla/client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_builder.h @@ -366,6 +366,7 @@ class XlaBuilder { // // TODO(b/119520625): Remove this API once we have more dynamic shape infra // ready. 
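The worklist rewrite above replaces the breadth-first queue with an explicit depth-first stack, so every operand receives its new id before the instruction that uses it, and (handle, need_rewrite) pairs are memoized. A generic, self-contained sketch of that traversal shape, using a toy graph type rather than the builder code:

    #include <cstddef>
    #include <map>
    #include <vector>

    struct ToyNode { std::vector<int> operands; };

    // Emits node ids of `graph` in post order starting from `root`: each node
    // is revisited after each operand finishes and emitted only once all of
    // its operands have been emitted.
    std::vector<int> PostOrder(const std::vector<ToyNode>& graph, int root) {
      std::vector<int> order;
      std::map<int, bool> emitted;
      struct Item { int node; size_t next_operand; };
      std::vector<Item> stack = {{root, 0}};
      while (!stack.empty()) {
        Item& item = stack.back();
        if (emitted.count(item.node)) { stack.pop_back(); continue; }
        const ToyNode& n = graph[item.node];
        if (item.next_operand < n.operands.size()) {
          stack.push_back({n.operands[item.next_operand++], 0});
          continue;
        }
        emitted[item.node] = true;
        order.push_back(item.node);
        stack.pop_back();
      }
      return order;
    }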
+ ABSL_DEPRECATED("Use SetDimensionSize to set a dynamic dimension.") Status SetDynamicBinding(int64 dynamic_size_param_num, ShapeIndex dynamic_size_param_index, int64 target_param_num, @@ -454,6 +455,10 @@ class XlaBuilder { XlaOp Reshape(const Shape& shape, XlaOp operand, int64 inferred_dimension = -1); + XlaOp DynamicReshape(XlaOp operand, absl::Span dim_sizes, + absl::Span new_size_bounds, + const std::vector& dims_are_dynamic); + XlaOp Collapse(XlaOp operand, absl::Span dimensions); XlaOp Slice(XlaOp operand, absl::Span start_indices, @@ -553,6 +558,12 @@ class XlaBuilder { FftType fft_type, absl::Span fft_length); + virtual StatusOr TriangularSolveInternal( + const Shape& shape, XlaOp a, XlaOp b, TriangularSolveOptions options); + + virtual StatusOr CholeskyInternal(const Shape& shape, XlaOp a, + bool lower); + XlaOp Infeed(const Shape& shape, const string& config = ""); XlaOp InfeedWithToken(XlaOp token, const Shape& shape, const string& config); virtual StatusOr InfeedWithTokenInternal( @@ -701,6 +712,11 @@ class XlaBuilder { XlaOp RngBitGenerator(RandomAlgorithm algorithm, XlaOp initial_state, const Shape& shape); + // Internal variant for the op with the full result shape containing both data + // and state shape as a tuple. + virtual StatusOr RngBitGeneratorInternal( + const Shape& full_result_shape, RandomAlgorithm algorithm, + XlaOp initial_state); XlaOp While(const XlaComputation& condition, const XlaComputation& body, XlaOp init); @@ -773,8 +789,13 @@ class XlaBuilder { XlaOp RemoveDynamicDimension(XlaOp operand, int64 dimension); - StatusOr AddInstruction(HloInstructionProto&& instr, HloOpcode opcode, - absl::Span operands = {}); + virtual StatusOr AddInstruction(HloInstructionProto&& instr, + HloOpcode opcode, + absl::Span operands); + StatusOr AddInstruction(HloInstructionProto&& instr, + HloOpcode opcode) { + return AddInstruction(std::move(instr), opcode, /*operands=*/{}); + } void AddCalledComputation(const XlaComputation& computation, HloInstructionProto* instr); @@ -940,6 +961,10 @@ class XlaBuilder { friend XlaOp Reshape(const Shape& shape, XlaOp operand); + friend XlaOp DynamicReshape(XlaOp operand, absl::Span dim_sizes, + absl::Span new_size_bounds, + const std::vector& dims_are_dynamic); + friend XlaOp ReshapeWithInferredDimension(XlaOp operand, absl::Span new_sizes, int64 inferred_dimension); @@ -1453,9 +1478,16 @@ XlaOp Pad(XlaOp operand, XlaOp padding_value, XlaOp Reshape(XlaOp operand, absl::Span dimensions, absl::Span new_sizes); -// Enqueues an operation onto the computation that collapses the operand, from -// first to last dimension (C order), then reshapes it to the given dimension -// sizes. Conceptually, this is a limited form of "shape casting". +// Enqueues a dynamic reshape operation. The dynamic reshape takes additional +// XlaOps as sizes for the result dimension. The result dim i is a dynamic +// dimension dimension if dims_are_dynamic[i] is true. +XlaOp DynamicReshape(XlaOp operand, absl::Span dim_sizes, + absl::Span new_size_bounds, + const std::vector& dims_are_dynamic); + +// Enqueues an operation onto the computation that collapses the operand, +// from first to last dimension (C order), then reshapes it to the given +// dimension sizes. Conceptually, this is a limited form of "shape casting". XlaOp Reshape(XlaOp operand, absl::Span new_sizes); // Enqueues a Reshape op that uses an explicit target shape. 
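A hedged usage sketch for the DynamicReshape entry point documented above, combined with SetDimensionSize; the parameter names and shapes are illustrative assumptions, not code from this change:

    xla::XlaBuilder b("dynamic_reshape_example");
    auto x = xla::Parameter(&b, 0, xla::ShapeUtil::MakeShape(xla::F32, {3, 4}), "x");
    auto rows = xla::Parameter(&b, 1, xla::ShapeUtil::MakeShape(xla::S32, {}), "rows");
    // Mark the first dimension of x as dynamic, with runtime size `rows` (<= 3).
    auto x_dynamic = xla::SetDimensionSize(x, rows, /*dimension=*/0);
    // Flatten to a rank-1 result with bound 12 whose runtime size is rows * 4.
    auto flat_size = xla::Mul(rows, xla::ConstantR0<int32_t>(&b, 4));
    auto flat = xla::DynamicReshape(x_dynamic, /*dim_sizes=*/{flat_size},
                                    /*new_size_bounds=*/{12},
                                    /*dims_are_dynamic=*/{true});

The result keeps the static bound of 12 elements while carrying the dynamic size rows * 4, which is the behavior the reshape and strided-slice kernel changes earlier in this diff rely on.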
diff --git a/tensorflow/compiler/xla/pjrt/cpu_device.cc b/tensorflow/compiler/xla/pjrt/cpu_device.cc index be70c16fc12..e2543bda7df 100644 --- a/tensorflow/compiler/xla/pjrt/cpu_device.cc +++ b/tensorflow/compiler/xla/pjrt/cpu_device.cc @@ -25,8 +25,8 @@ static const char kCpuPlatformName[] = "cpu"; CpuDevice::CpuDevice(int id, std::unique_ptr local_device_state) - : Device(id, std::move(local_device_state), kCpuPlatformName, - /*device_kind=*/kCpuPlatformName) {} + : PjRtDevice(id, std::move(local_device_state), kCpuPlatformName, + /*device_kind=*/kCpuPlatformName) {} StatusOr> GetCpuClient(bool asynchronous) { TF_ASSIGN_OR_RETURN(se::Platform * platform, @@ -39,7 +39,7 @@ StatusOr> GetCpuClient(bool asynchronous) { TF_ASSIGN_OR_RETURN(LocalClient * client, ClientLibrary::GetOrCreateLocalClient(options)); - std::vector> devices; + std::vector> devices; for (int i = 0; i < client->device_count(); ++i) { se::StreamExecutorConfig config; config.ordinal = i; diff --git a/tensorflow/compiler/xla/pjrt/cpu_device.h b/tensorflow/compiler/xla/pjrt/cpu_device.h index c70d90ae228..ad0079b1c4a 100644 --- a/tensorflow/compiler/xla/pjrt/cpu_device.h +++ b/tensorflow/compiler/xla/pjrt/cpu_device.h @@ -23,7 +23,7 @@ limitations under the License. namespace xla { -class CpuDevice : public Device { +class CpuDevice : public PjRtDevice { public: CpuDevice(int id, std::unique_ptr local_device_state); }; diff --git a/tensorflow/compiler/xla/pjrt/gpu_multistream_test.cc b/tensorflow/compiler/xla/pjrt/gpu_multistream_test.cc index d54be61fbb8..298c41c7f58 100644 --- a/tensorflow/compiler/xla/pjrt/gpu_multistream_test.cc +++ b/tensorflow/compiler/xla/pjrt/gpu_multistream_test.cc @@ -32,7 +32,7 @@ TEST(GpuMultiStream, Basics) { GetNvidiaGpuClient(/*asynchronous=*/true, GpuAllocatorConfig(), /*distributed_client=*/nullptr, /*node_id=*/0)); - Device* device = client->local_devices().at(0); + PjRtDevice* device = client->local_devices().at(0); int n = 1024; Shape shape = ShapeUtil::MakeShape(S32, {n}); diff --git a/tensorflow/compiler/xla/pjrt/interpreter_device.cc b/tensorflow/compiler/xla/pjrt/interpreter_device.cc index f7138a8c181..c1149f2dbf9 100644 --- a/tensorflow/compiler/xla/pjrt/interpreter_device.cc +++ b/tensorflow/compiler/xla/pjrt/interpreter_device.cc @@ -25,8 +25,8 @@ static const char kInterpreterPlatformName[] = "interpreter"; InterpreterDevice::InterpreterDevice( int id, std::unique_ptr local_device_state) - : Device(id, std::move(local_device_state), kInterpreterPlatformName, - /*device_kind=*/kInterpreterPlatformName) {} + : PjRtDevice(id, std::move(local_device_state), kInterpreterPlatformName, + /*device_kind=*/kInterpreterPlatformName) {} StatusOr> GetInterpreterClient() { TF_ASSIGN_OR_RETURN(se::Platform * platform, @@ -40,7 +40,7 @@ StatusOr> GetInterpreterClient() { TF_ASSIGN_OR_RETURN(LocalClient * client, ClientLibrary::GetOrCreateLocalClient(options)); - std::vector> devices; + std::vector> devices; se::StreamExecutor* executor = client->backend().stream_executor(0).ValueOrDie(); auto device_state = absl::make_unique( diff --git a/tensorflow/compiler/xla/pjrt/interpreter_device.h b/tensorflow/compiler/xla/pjrt/interpreter_device.h index 58b210ad762..cf732f70124 100644 --- a/tensorflow/compiler/xla/pjrt/interpreter_device.h +++ b/tensorflow/compiler/xla/pjrt/interpreter_device.h @@ -23,7 +23,7 @@ limitations under the License. 
namespace xla { -class InterpreterDevice : public Device { +class InterpreterDevice : public PjRtDevice { public: InterpreterDevice(int id, std::unique_ptr local_device_state); diff --git a/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.cc b/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.cc index edffaf6c877..512ff81ef6e 100644 --- a/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.cc +++ b/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.cc @@ -207,9 +207,9 @@ StatusOr NcclIdStore::GetNcclUniqueId(const NcclCliqueKey& key) { return cache_.emplace(key_string, result.ValueOrDie()).first->second; } -std::vector> BuildLocalDevices( +std::vector> BuildLocalDevices( std::vector> local_device_states) { - std::vector> devices; + std::vector> devices; for (auto& local_device : local_device_states) { int device_ordinal = local_device->device_ordinal(); const se::DeviceDescription& description = @@ -225,7 +225,7 @@ std::vector> BuildLocalDevices( Status BuildDistributedDevices( std::vector> local_device_states, std::shared_ptr distributed_client, int node_id, - std::vector>* devices, + std::vector>* devices, GpuExecutableRunOptions* gpu_executable_run_options) { LocalTopologyProto local_topology; local_topology.set_node_id(node_id); @@ -286,8 +286,8 @@ Status BuildDistributedDevices( GpuDevice::GpuDevice(int id, std::unique_ptr local_device_state, std::string device_kind, int node_id) - : Device(id, std::move(local_device_state), kGpuPlatformName, - std::move(device_kind), node_id) {} + : PjRtDevice(id, std::move(local_device_state), kGpuPlatformName, + std::move(device_kind), node_id) {} StatusOr> GetNvidiaGpuClient( bool asynchronous, const GpuAllocatorConfig& allocator_config, @@ -302,7 +302,7 @@ StatusOr> GetNvidiaGpuClient( auto host_memory_allocator = GetGpuHostAllocator(local_device_states.front()->executor()); - std::vector> devices; + std::vector> devices; auto gpu_run_options = absl::make_unique(); if (distributed_client) { TF_RETURN_IF_ERROR(BuildDistributedDevices( diff --git a/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.h b/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.h index bf59ddef3a9..4f22a169bd8 100644 --- a/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.h +++ b/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.h @@ -25,7 +25,7 @@ limitations under the License. namespace xla { -class GpuDevice : public Device { +class GpuDevice : public PjRtDevice { public: GpuDevice(int id, std::unique_ptr local_device_state, std::string device_kind, int node_id); diff --git a/tensorflow/compiler/xla/pjrt/pjrt_client.cc b/tensorflow/compiler/xla/pjrt/pjrt_client.cc index c5dce4a37f7..099c7729679 100644 --- a/tensorflow/compiler/xla/pjrt/pjrt_client.cc +++ b/tensorflow/compiler/xla/pjrt/pjrt_client.cc @@ -112,19 +112,19 @@ limitations under the License. 
namespace xla { -StatusOr Device::GetLocalDeviceState() const { +StatusOr PjRtDevice::GetLocalDeviceState() const { if (local_device_state_) { return local_device_state_.get(); } return InvalidArgument("Device %s is not a local device.", DebugString()); } -std::string Device::DebugString() const { +std::string PjRtDevice::DebugString() const { return absl::StrCat(platform_name(), ":", id()); } StatusOr DevicesToDeviceAssignment( - absl::Span> devices) { + absl::Span> devices) { if (devices.empty()) { return InvalidArgument( "Device assignment passed to Compile() must be non-empty."); @@ -175,7 +175,7 @@ class CpuAllocator : public tensorflow::Allocator { PjRtClient::PjRtClient( std::string platform_name, LocalClient* client, - std::vector> devices, int host_id, + std::vector> devices, int host_id, std::unique_ptr allocator, std::unique_ptr host_memory_allocator, bool should_stage_host_to_device_transfers, @@ -201,7 +201,7 @@ PjRtClient::PjRtClient( host_memory_allocator_ = std::make_unique(); } - for (const std::unique_ptr& device : devices_) { + for (const std::unique_ptr& device : devices_) { CHECK(id_to_device_.insert({device->id(), device.get()}).second) << "Duplicate device id: " << device->id(); @@ -376,8 +376,9 @@ void RecordUsage(PjRtBuffer::ScopedHold device_buffer, // It is safe to delete the returned PjRtBuffer without further // synchronization if an error occurs before the buffer is used. StatusOr> AllocateDestinationBuffer( - const Shape& on_host_shape, Device* device, LocalDeviceState* local_device, - se::Stream* copy_stream, bool is_uninitialized_create, PjRtClient* client) { + const Shape& on_host_shape, PjRtDevice* device, + LocalDeviceState* local_device, se::Stream* copy_stream, + bool is_uninitialized_create, PjRtClient* client) { if (on_host_shape.IsTuple() && on_host_shape.tuple_shapes_size() == 0) { return InvalidArgument("Can't make a buffer from an empty tuple"); } @@ -574,7 +575,7 @@ StatusOr> PjRtBuffer::FromHostBuffer( const void* data, const Shape& shape, HostBufferSemantics host_buffer_semantics, std::shared_ptr buffer_reference, PjRtClient* client, - Device* device) { + PjRtDevice* device) { tensorflow::profiler::TraceMe traceme("PjRtBuffer::FromHostBuffer"); VLOG(2) << "PjRtBuffer::FromHostBuffer: shape: " << shape.ToString() << " device: " << device->DebugString(); @@ -736,7 +737,7 @@ StatusOr> PjRtBuffer::FromHostBuffer( /* static */ StatusOr> PjRtBuffer::CreateUninitialized( - const Shape& shape, PjRtClient* client, Device* device) { + const Shape& shape, PjRtClient* client, PjRtDevice* device) { tensorflow::profiler::TraceMe traceme("PjRtBuffer::CreateUninitialized"); VLOG(2) << "PjRtBuffer::CreateUninitialized: shape: " << shape.ToString() << " device: " << device->DebugString(); @@ -755,7 +756,7 @@ StatusOr> PjRtBuffer::CreateUninitialized( /* static */ StatusOr> PjRtBuffer::FromHostLiteral( - const LiteralSlice& literal, PjRtClient* client, Device* device) { + const LiteralSlice& literal, PjRtClient* client, PjRtDevice* device) { tensorflow::profiler::TraceMe traceme("PjRtBuffer::FromHostLiteral"); VLOG(2) << "PjRtBuffer::FromHostLiteral: shape: " << literal.shape().ToString() << " device: " << device->DebugString(); @@ -815,7 +816,7 @@ StatusOr> PjRtBuffer::FromHostLiteral( } /*static*/ void PjRtBuffer::MakeCrossHostReceiveBuffers( - absl::Span shapes, PjRtClient* client, Device* device, + absl::Span shapes, PjRtClient* client, PjRtDevice* device, PjRtCrossHostRecvNotifier&& notifier) { if (shapes.empty()) { notifier(InvalidArgument( @@ -849,7 
+850,7 @@ StatusOr> PjRtBuffer::FromHostLiteral( PjRtBuffer::PjRtBuffer(Shape on_host_shape, Shape on_device_shape, std::shared_ptr device_buffer, - PjRtClient* client, Device* device) + PjRtClient* client, PjRtDevice* device) : client_(client), on_host_shape_(std::move(on_host_shape)), on_device_shape_(std::move(on_device_shape)), @@ -1189,7 +1190,7 @@ PjRtBuffer::ScopedHold PjRtBuffer::GetBufferWithHold(ScopedHold::Type type) { StatusOr, std::shared_ptr>> PjRtBuffer::CopyToDeviceHelper( - Device* dst_device, LocalDeviceState* dst_local_device, + PjRtDevice* dst_device, LocalDeviceState* dst_local_device, LocalDeviceState* transfer_local_device, se::Stream* transfer_stream, std::shared_ptr src_device_buffer) { TF_ASSIGN_OR_RETURN( @@ -1249,7 +1250,7 @@ PjRtBuffer::CopyToDeviceHelper( } StatusOr> PjRtBuffer::CopyToDevice( - Device* dst_device) { + PjRtDevice* dst_device) { tensorflow::profiler::TraceMe traceme("PjRtBuffer::CopyToDevice"); if (dst_device == device_) { return InvalidArgument( @@ -1342,8 +1343,6 @@ namespace { // Helper struct for the tuple that is transiently constructed to hold the // arguments of an execution. struct TupleHandle { - // The tuple's shape on the host. - Shape on_host_shape; // The ExecutionInput describing the tuple. ExecutionInput execution_input; // A definition event that has been recorded on the host_to_device stream @@ -1414,8 +1413,7 @@ StatusOr MakeTupleHelper( auto transfer_event = std::make_shared(); transfer_event->SetSequencingEvent(event_or.ConsumeValueOrDie(), stream); - return TupleHandle({std::move(on_host_shape), std::move(execution_input), - std::move(transfer_event)}); + return TupleHandle({std::move(execution_input), std::move(transfer_event)}); } // Converts a ScopedShapedBuffer returned from an execution into a @@ -1423,20 +1421,20 @@ StatusOr MakeTupleHelper( std::unique_ptr OutputBufferHelper( ScopedShapedBuffer* result_buffer, std::shared_ptr definition_event, PjRtClient* client, - Device* device, LocalDeviceState* local_device) { + PjRtDevice* device, LocalDeviceState* local_device) { std::shared_ptr out_buffer = TrackedDeviceBuffer::FromScopedShapedBuffer(result_buffer, {definition_event}); - auto py_buffer = absl::make_unique( + auto pjrt_buffer = absl::make_unique( result_buffer->on_host_shape(), result_buffer->on_device_shape(), std::move(out_buffer), client, device); - RecordUsage(py_buffer->GetBufferWithUsageHold(), local_device, local_device, + RecordUsage(pjrt_buffer->GetBufferWithUsageHold(), local_device, local_device, definition_event, local_device->compute_stream(), /*prefer_to_retain_reference=*/false); - return py_buffer; + return pjrt_buffer; } -static Device* LookupDevice(const PjRtClient& client, int device_id) { +static PjRtDevice* LookupDevice(const PjRtClient& client, int device_id) { auto it = client.id_to_device().find(device_id); CHECK(it != client.id_to_device().end()) << "Unknown device id: " << device_id; @@ -1450,7 +1448,7 @@ PjRtExecutable::PjRtExecutable( bool parameter_is_tupled_arguments, std::shared_ptr device_assignment, std::vector> local_logical_device_ids, - std::vector local_devices, PjRtClient* client) + std::vector local_devices, PjRtClient* client) : client_(client), device_assignment_(std::move(device_assignment)), parameter_is_tupled_arguments_(parameter_is_tupled_arguments), @@ -1508,15 +1506,64 @@ const std::string& PjRtExecutable::name() const { } } +bool PjRtExecutable::MustDonateParameter(int executable_idx, + int parameter) const { + return 
parameters_that_must_be_donated_[executable_idx].contains(parameter); +} + +StatusOr> +PjRtExecutable::MakeExecutionInputsAndWaitForEvents( + int device_ordinal, const ExecuteOptions& options, + absl::Span argument_handles, + absl::Span device_buffers, + absl::flat_hash_set& events) const { + std::vector execution_inputs; + LocalDeviceState* device_state = &client_->device_state(device_ordinal); + // Lift tuple_handle outside the conditional so that the event it returns is + // not destroyed until after the loop below that waits on events. + absl::optional tuple_handle; + if (parameter_is_tupled_arguments_ && !options.arguments_are_tupled) { + TF_ASSIGN_OR_RETURN(tuple_handle, + MakeTupleHelper(client_, device_state, argument_handles, + device_buffers, device_ordinal)); + events.insert(tuple_handle->event.get()); + execution_inputs.emplace_back(std::move(tuple_handle->execution_input)); + } else { + execution_inputs.reserve(argument_handles.size()); + for (int i = 0; i < argument_handles.size(); ++i) { + PjRtBuffer* handle = argument_handles[i]; + + // Make an ExecutionInput from the device buffer. + execution_inputs.emplace_back(handle->on_device_shape(), + handle->on_host_shape()); + ExecutionInput& execution_input = execution_inputs.back(); + ShapeTree::iterator input_iterator = + execution_input.MutableBuffers()->begin(); + ShapeTree::iterator iterator_end = + execution_input.MutableBuffers()->end(); + device_buffers[i].AddToInput(&input_iterator, iterator_end, + &execution_input, client_->allocator()); + CHECK(input_iterator == iterator_end); + } + } + + for (BufferSequencingEvent* event : events) { + event->WaitForEventOnStream(device_state->compute_stream()); + } + + return execution_inputs; +} + // Enqueues a computation onto the compute stream. Each buffer returned in // device_buffers has a usage hold added that must be dropped on error or // converted on success. StatusOr PjRtExecutable::EnqueueExecution( absl::Span argument_handles, int replica, int partition, int executable_idx, const RunId& run_id, const ExecuteOptions& options, - Device* device, std::vector* device_buffers, + PjRtDevice* device, std::vector* device_buffers, std::shared_ptr device_assignment) const { int device_ordinal = device->local_device_state()->device_ordinal(); + LocalDeviceState* device_state = &client_->device_state(device_ordinal); tensorflow::profiler::TraceMeConsumer activity( "LocalExecutable::Execute", tensorflow::profiler::ContextType::kPjRt, run_id.ToInt()); @@ -1524,10 +1571,7 @@ StatusOr PjRtExecutable::EnqueueExecution( << " mapped to device ordinal for execution: " << device_ordinal; absl::flat_hash_set events; - std::vector execution_inputs; device_buffers->reserve(argument_handles.size()); - const absl::flat_hash_set& parameters_that_must_be_donated = - parameters_that_must_be_donated_[executable_idx]; for (int i = 0; i < argument_handles.size(); ++i) { PjRtBuffer* handle = argument_handles[i]; if (handle->device() != device) { @@ -1536,8 +1580,7 @@ StatusOr PjRtExecutable::EnqueueExecution( "device %s, but replica is assigned to device %s.", i, replica, handle->device()->DebugString(), device->DebugString()); } - bool must_donate = parameters_that_must_be_donated.find(i) != - parameters_that_must_be_donated.end(); + bool must_donate = MustDonateParameter(executable_idx, i); device_buffers->emplace_back(handle->GetBufferWithHold( must_donate ? 
PjRtBuffer::ScopedHold::kDonation : PjRtBuffer::ScopedHold::kUsage)); @@ -1571,37 +1614,10 @@ StatusOr PjRtExecutable::EnqueueExecution( } } - LocalDeviceState* device_state = &client_->device_state(device_ordinal); - absl::optional tuple_handle; - if (parameter_is_tupled_arguments_ && !options.arguments_are_tupled) { - TF_ASSIGN_OR_RETURN(tuple_handle, - MakeTupleHelper(client_, device_state, argument_handles, - *device_buffers, device_ordinal)); - events.insert(tuple_handle->event.get()); - execution_inputs.emplace_back(std::move(tuple_handle->execution_input)); - } else { - execution_inputs.reserve(argument_handles.size()); - for (int i = 0; i < argument_handles.size(); ++i) { - PjRtBuffer* handle = argument_handles[i]; - - const PjRtBuffer::ScopedHold& device_buffer = (*device_buffers)[i]; - // Make an ExecutionInput from the device buffer. - execution_inputs.emplace_back(handle->on_device_shape(), - handle->on_host_shape()); - ExecutionInput& execution_input = execution_inputs.back(); - ShapeTree::iterator input_iterator = - execution_input.MutableBuffers()->begin(); - ShapeTree::iterator iterator_end = - execution_input.MutableBuffers()->end(); - device_buffer.AddToInput(&input_iterator, iterator_end, &execution_input, - client_->allocator()); - CHECK(input_iterator == iterator_end); - } - } - - for (BufferSequencingEvent* event : events) { - event->WaitForEventOnStream(device_state->compute_stream()); - } + TF_ASSIGN_OR_RETURN( + std::vector execution_inputs, + MakeExecutionInputsAndWaitForEvents( + device_ordinal, options, argument_handles, *device_buffers, events)); ExecutableRunOptions run_options; run_options.set_stream(device_state->compute_stream()); @@ -1676,11 +1692,45 @@ StatusOr PjRtExecutable::EnqueueExecution( return result_buffer_or_status.ConsumeValueOrDie().ConsumeResult(); } +std::vector> PjRtExecutable::MakeOutputBuffers( + int device_ordinal, const ExecuteOptions& options, + ScopedShapedBuffer result_buffer, + std::shared_ptr definition_event, + PjRtDevice* device) const { + std::vector> outputs; + LocalDeviceState* device_state = &client_->device_state(device_ordinal); + if (options.untuple_result && result_buffer.on_host_shape().IsTuple()) { + int tuple_count = result_buffer.on_host_shape().tuple_shapes_size(); + outputs.reserve(tuple_count); + // Take ownership of each of the output values, leaving only the root table + // in result_buffer. + for (int i = 0; i < tuple_count; ++i) { + ScopedShapedBuffer tuple_buffer = result_buffer.TakeSubTree({i}); + outputs.push_back(OutputBufferHelper(&tuple_buffer, definition_event, + client_, device, device_state)); + } + if (device_state->allocation_model() == LocalDeviceState::kSynchronous) { + // Don't release the root buffer until after execution completes. 
+ ShapedBuffer root_buffer_holder = result_buffer.release(); + se::DeviceMemoryBase root_buffer = root_buffer_holder.root_buffer(); + device_state->ThenExecuteOnCallbackThread( + device_state->compute_stream(), + [root_buffer, allocator{client_->allocator()}, device_ordinal]() { + TF_CHECK_OK(allocator->Deallocate(device_ordinal, root_buffer)); + }); + } + } else { + outputs.push_back(OutputBufferHelper(&result_buffer, definition_event, + client_, device, device_state)); + } + return outputs; +} + StatusOr>> PjRtExecutable::ExecuteHelper(absl::Span argument_handles, int replica, int partition, const RunId& run_id, const ExecuteOptions& options, - Device* device) const { + PjRtDevice* device) const { std::shared_ptr device_assignment; if (device == nullptr) { CHECK(device_assignment_ != nullptr); @@ -1737,31 +1787,9 @@ PjRtExecutable::ExecuteHelper(absl::Span argument_handles, } auto definition_event = std::make_shared(); definition_event->SetSequencingEvent(event_or.ConsumeValueOrDie(), stream); - std::vector> outputs; - if (options.untuple_result && result_buffer.on_host_shape().IsTuple()) { - int tuple_count = result_buffer.on_host_shape().tuple_shapes_size(); - outputs.reserve(tuple_count); - // Take ownership of each of the output values, leaving only the root table - // in result_buffer. - for (int i = 0; i < tuple_count; ++i) { - ScopedShapedBuffer tuple_buffer = result_buffer.TakeSubTree({i}); - outputs.push_back(OutputBufferHelper(&tuple_buffer, definition_event, - client_, device, device_state)); - } - if (device_state->allocation_model() == LocalDeviceState::kSynchronous) { - // Don't release the root buffer until after execution completes. - ShapedBuffer root_buffer_holder = result_buffer.release(); - se::DeviceMemoryBase root_buffer = root_buffer_holder.root_buffer(); - device_state->ThenExecuteOnCallbackThread( - device_state->compute_stream(), - [root_buffer, allocator{client_->allocator()}, device_ordinal]() { - TF_CHECK_OK(allocator->Deallocate(device_ordinal, root_buffer)); - }); - } - } else { - outputs.push_back(OutputBufferHelper(&result_buffer, definition_event, - client_, device, device_state)); - } + std::vector> outputs = + MakeOutputBuffers(device_ordinal, options, std::move(result_buffer), + definition_event, device); for (PjRtBuffer::ScopedHold& b : device_buffers) { // prefer_to_retain_reference=false because when using the @@ -1801,7 +1829,7 @@ StatusOr>> PjRtExecutable::Execute( StatusOr>> PjRtExecutable::ExecuteOnLocalDevice( - absl::Span argument_handles, Device* device, + absl::Span argument_handles, PjRtDevice* device, const ExecuteOptions& options) const { if (device_assignment_ == nullptr) { VLOG(1) << "Executing portable single-core program on " @@ -1867,7 +1895,7 @@ PjRtExecutable::ExecuteOnLocalDevices( for (int i = 0; i < num_local_devices; ++i) { const int replica = local_logical_device_ids_[i].first; const int partition = local_logical_device_ids_[i].second; - Device* device = local_devices_[i]; + PjRtDevice* device = local_devices_[i]; const LocalDeviceState& device_state = *device->local_device_state(); device_state.execute_thread()->Schedule([&, replica, partition, i] { results[i] = ExecuteHelper(argument_handles[i], replica, partition, @@ -2114,12 +2142,12 @@ StatusOr, Shape>> GetShardedProgramShapes( build_options.set_result_layout(result_layout); std::vector> local_logical_device_ids; - std::vector local_devices; + std::vector local_devices; if (device_assignment != nullptr) { for (int replica = 0; replica < num_replicas; ++replica) { for 
(int partition = 0; partition < num_partitions; ++partition) { int device_id = (*device_assignment)(replica, partition); - Device* device = LookupDevice(*client, device_id); + PjRtDevice* device = LookupDevice(*client, device_id); if (device->host_id() != client->host_id()) { VLOG(3) << "Non-local device: " << device_id; continue; diff --git a/tensorflow/compiler/xla/pjrt/pjrt_client.h b/tensorflow/compiler/xla/pjrt/pjrt_client.h index bb9093a8bf7..1bed959e3e6 100644 --- a/tensorflow/compiler/xla/pjrt/pjrt_client.h +++ b/tensorflow/compiler/xla/pjrt/pjrt_client.h @@ -52,17 +52,18 @@ namespace xla { class PjRtClient; -class Device { +class PjRtDevice { public: - explicit Device(int id, std::unique_ptr local_device_state, - std::string platform_name, std::string device_kind, - int host_id = 0) + explicit PjRtDevice(int id, + std::unique_ptr local_device_state, + std::string platform_name, std::string device_kind, + int host_id = 0) : id_(id), local_device_state_(std::move(local_device_state)), host_id_(host_id), platform_name_(std::move(platform_name)), device_kind_(std::move(device_kind)) {} - virtual ~Device() {} + virtual ~PjRtDevice() {} // The ID of this device. IDs are unique among devices of this type // (e.g. CPUs, GPUs). On multi-host platforms, this will be unique across all @@ -130,7 +131,7 @@ class PjRtClient { // `allocator` may null, in which case the platform default allocator is used. explicit PjRtClient( std::string platform_name, LocalClient* client, - std::vector> devices, int host_id, + std::vector> devices, int host_id, std::unique_ptr allocator, std::unique_ptr host_memory_allocator, bool should_stage_host_to_device_transfers, @@ -142,11 +143,15 @@ class PjRtClient { int device_count() const { return devices_.size(); } int local_device_count() const { return local_devices_.size(); } - const std::vector>& devices() const { + const std::vector>& devices() const { return devices_; } - const std::vector& local_devices() const { return local_devices_; } - const std::map& id_to_device() const { return id_to_device_; } + const std::vector& local_devices() const { + return local_devices_; + } + const std::map& id_to_device() const { + return id_to_device_; + } int host_id() const { return host_id_; } const std::string& platform_name() const { return platform_name_; } @@ -210,11 +215,11 @@ class PjRtClient { std::unique_ptr host_memory_allocator_; // Includes all devices, including non-local devices on multi-host platforms. - std::vector> devices_; + std::vector> devices_; // Maps Device::id() to the corresponding Device. Includes all devices. - std::map id_to_device_; + std::map id_to_device_; // Local devices indexed by local device ordinal. - std::vector local_devices_; + std::vector local_devices_; int host_id_; se::DeviceMemoryAllocator* allocator_; @@ -233,7 +238,7 @@ class PjRtClient { // Converts a 2D set of Device objects indexed by [replica][partition] into an // xla::DeviceAssignment. StatusOr DevicesToDeviceAssignment( - absl::Span> devices); + absl::Span> devices); // Holds a reference from Python to a tuple of device buffers. A PjRtBuffer // can be either valid or invalid. An invalid buffer is one that has never been @@ -417,7 +422,7 @@ class PjRtBuffer { // Returns a buffer with uninitialized contents. 
static StatusOr> CreateUninitialized( - const Shape& shape, PjRtClient* client, Device* device); + const Shape& shape, PjRtClient* client, PjRtDevice* device); // Describes the semantics the caller to FromHostBuffer expects from the // runtime, in a total order from most restrictive to least restrictive. @@ -449,13 +454,13 @@ class PjRtBuffer { const void* data, const Shape& shape, HostBufferSemantics host_buffer_semantics, std::shared_ptr buffer_reference, PjRtClient* client, - Device* device); + PjRtDevice* device); // Note that literal must remain in scope until the transfer has completed, so // the caller should, for example, wait for BlockHostUntilReady() completes on // the return value before letting literal go out of scope. static StatusOr> FromHostLiteral( - const LiteralSlice& literal, PjRtClient* client, Device* device); + const LiteralSlice& literal, PjRtClient* client, PjRtDevice* device); // Asynchronously makes a vector of PjRtBuffers that can be used to receive // cross host transfers using `client` on `device'. `shapes` must be the exact @@ -467,12 +472,13 @@ class PjRtBuffer { // sending host and used in a call to CopyToRemoteDevice. None of the recv // buffers will become ready until *all* of the sends have completed. static void MakeCrossHostReceiveBuffers(absl::Span shapes, - PjRtClient* client, Device* device, + PjRtClient* client, + PjRtDevice* device, PjRtCrossHostRecvNotifier&& notifier); PjRtBuffer(Shape on_host_shape, Shape on_device_shape, std::shared_ptr device_buffer, - PjRtClient* client, Device* device); + PjRtClient* client, PjRtDevice* device); ~PjRtBuffer(); PjRtBuffer(const PjRtBuffer&) = delete; @@ -482,7 +488,7 @@ class PjRtBuffer { const Shape& on_host_shape() const { return on_host_shape_; } const Shape& on_device_shape() const { return on_device_shape_; } - Device* device() const { return device_; } + PjRtDevice* device() const { return device_; } const std::string& platform_name() const { return client_->platform_name(); } PjRtClient* client() const { return client_; } bool IsEmptyTuple() const { @@ -556,7 +562,7 @@ class PjRtBuffer { // Copies the buffer to device `dst_device`. Returns an error if the buffer is // already on dst_device. - StatusOr> CopyToDevice(Device* dst_device); + StatusOr> CopyToDevice(PjRtDevice* dst_device); // Copies the buffer to the remote device encoded in serialized_descriptor. 
// This call must be preceded by a call to MakeCrossHostReceiveBuffers on the @@ -629,7 +635,7 @@ class PjRtBuffer { StatusOr, std::shared_ptr>> - CopyToDeviceHelper(Device* dst_device, LocalDeviceState* dst_local_device, + CopyToDeviceHelper(PjRtDevice* dst_device, LocalDeviceState* dst_local_device, LocalDeviceState* transfer_local_device, se::Stream* transfer_stream, std::shared_ptr src_device_buffer); @@ -637,7 +643,7 @@ class PjRtBuffer { PjRtClient* const client_; const Shape on_host_shape_; const Shape on_device_shape_; - Device* const device_; + PjRtDevice* const device_; mutable absl::Mutex mu_; std::shared_ptr device_buffer_ TF_GUARDED_BY(mu_); @@ -668,6 +674,11 @@ struct CompileOptions { bool compile_portable_executable = false; }; +class ExecuteContext { + public: + virtual ~ExecuteContext() = default; +}; + struct ExecuteOptions { // If true, the client must pass a single PjRtBuffer which contains all of // the arguments as a single XLA tuple, otherwise each argument must be @@ -682,6 +693,9 @@ struct ExecuteOptions { // multi-host programs are launched in different orders on different hosts, // the launch IDs may be used by the runtime to detect the mismatch. int32 launch_id = 0; + // If non-null, an opaque context passed to an execution that may be used to + // supply additional arguments to a derived class of PjRtExecutable. + ExecuteContext* context = nullptr; }; // Represents a compiled computation that can be executed given handles to @@ -699,7 +713,7 @@ class PjRtExecutable { bool parameter_is_tupled_arguments, std::shared_ptr device_assignment, std::vector> local_logical_device_ids, - std::vector local_devices, PjRtClient* client); + std::vector local_devices, PjRtClient* client); virtual ~PjRtExecutable() = default; @@ -733,14 +747,16 @@ class PjRtExecutable { return local_logical_device_ids_; } - const std::vector& local_devices() const { return local_devices_; } + const std::vector& local_devices() const { + return local_devices_; + } StatusOr>> Execute( absl::Span argument_handles, const ExecuteOptions& options) const; StatusOr>> ExecuteOnLocalDevice( - absl::Span argument_handles, Device* device, + absl::Span argument_handles, PjRtDevice* device, const ExecuteOptions& options) const; // Execute on local devices. Takes a sequence of argument lists (one argument @@ -756,22 +772,42 @@ class PjRtExecutable { const string& name() const; + protected: + bool parameter_is_tupled_arguments() const { + return parameter_is_tupled_arguments_; + } + private: // Initializes information about which arguments to which executables must be // donated due to aliases that were specified by the computation. 
Status SetUpDonation(PjRtClient* client, bool tuple_inputs); + virtual bool MustDonateParameter(int executable_idx, int parameter) const; + + virtual StatusOr> + MakeExecutionInputsAndWaitForEvents( + int device_ordinal, const ExecuteOptions& options, + absl::Span argument_handles, + absl::Span device_buffers, + absl::flat_hash_set& events) const; + StatusOr EnqueueExecution( absl::Span argument_handles, int replica, int partition, int executable_idx, const RunId& run_id, - const ExecuteOptions& options, Device* device, + const ExecuteOptions& options, PjRtDevice* device, std::vector* device_buffers, std::shared_ptr device_assignment) const; + virtual std::vector> MakeOutputBuffers( + int device_ordinal, const ExecuteOptions& options, + ScopedShapedBuffer result_buffer, + std::shared_ptr definition_event, + PjRtDevice* device) const; + StatusOr>> ExecuteHelper( absl::Span argument_handles, int replica, int partition, const RunId& run_id, const ExecuteOptions& options, - Device* device = nullptr) const; + PjRtDevice* device = nullptr) const; // Create shared pointers so we can free them after the execution: with // asynchronous execution, the process being executed can outlive the @@ -800,7 +836,7 @@ class PjRtExecutable { // assigned. // shared_ptrs instead of unique_ptrs to play well with the Python bindings // (see xla.cc). - std::vector local_devices_; + std::vector local_devices_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD index 1330dca6402..046fadb405b 100644 --- a/tensorflow/compiler/xla/python/BUILD +++ b/tensorflow/compiler/xla/python/BUILD @@ -155,7 +155,7 @@ cc_library( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", - "//tensorflow/core/lib/bfloat16", + "//tensorflow/core/platform:bfloat16", "//tensorflow/core/platform:logging", "//third_party/py/numpy:headers", "//third_party/python_runtime:headers", # buildcleaner: keep @@ -242,6 +242,33 @@ cc_library( ], ) +cc_library( + name = "jax_jit", + srcs = ["jax_jit.cc"], + hdrs = ["jax_jit.h"], + copts = [ + "-fexceptions", + "-fno-strict-aliasing", + ], + features = ["-use_header_modules"], + visibility = ["//visibility:private"], + deps = [ + ":py_client", + ":pytree", + ":types", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/compiler/xla/pjrt:pjrt_client", + "//tensorflow/core/platform:status", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/types:optional", + "@pybind11", + ], +) + cc_library( name = "ops", srcs = ["ops.cc"], @@ -367,6 +394,7 @@ pybind_extension( deps = [ ":bfloat16", ":dlpack", + ":jax_jit", ":ops", ":py_client", ":pytree", diff --git a/tensorflow/compiler/xla/python/bfloat16.cc b/tensorflow/compiler/xla/python/bfloat16.cc index 1f21b3fb242..b70244cc3ef 100644 --- a/tensorflow/compiler/xla/python/bfloat16.cc +++ b/tensorflow/compiler/xla/python/bfloat16.cc @@ -27,7 +27,7 @@ limitations under the License. 
#include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/bfloat16/bfloat16.h" +#include "tensorflow/core/platform/bfloat16.h" #include "tensorflow/core/platform/logging.h" namespace xla { diff --git a/tensorflow/compiler/xla/python/dlpack.cc b/tensorflow/compiler/xla/python/dlpack.cc index 4fc17172ea7..974816407ee 100644 --- a/tensorflow/compiler/xla/python/dlpack.cc +++ b/tensorflow/compiler/xla/python/dlpack.cc @@ -193,7 +193,7 @@ StatusOr> StridesToLayout(absl::Span dims, return minor_to_major; } -StatusOr DLDeviceTypeForDevice(const Device& device) { +StatusOr DLDeviceTypeForDevice(const PjRtDevice& device) { const se::Platform* platform = device.local_device_state()->executor()->platform(); if (platform->id() == se::host::kHostPlatformId) { @@ -205,15 +205,15 @@ StatusOr DLDeviceTypeForDevice(const Device& device) { device.DebugString()); } -StatusOr DLContextForDevice(const Device& device) { +StatusOr DLContextForDevice(const PjRtDevice& device) { DLContext context; TF_ASSIGN_OR_RETURN(context.device_type, DLDeviceTypeForDevice(device)); context.device_id = device.local_device_state()->device_ordinal(); return context; } -StatusOr DeviceForDLContext(const PjRtClient& client, - const DLContext& context) { +StatusOr DeviceForDLContext(const PjRtClient& client, + const DLContext& context) { se::Platform::Id platform_id; switch (context.device_type) { case kDLCPU: @@ -226,7 +226,7 @@ StatusOr DeviceForDLContext(const PjRtClient& client, return InvalidArgument("Unknown/unsupported DLPack device type %d", context.device_type); } - auto it = absl::c_find_if(client.local_devices(), [&](Device* device) { + auto it = absl::c_find_if(client.local_devices(), [&](PjRtDevice* device) { return device->local_device_state()->executor()->platform()->id() == platform_id && device->local_device_state()->device_ordinal() == context.device_id; @@ -313,7 +313,7 @@ StatusOr> DLPackManagedTensorToBuffer( dlmt->dl_tensor.ndim); } TF_ASSIGN_OR_RETURN( - Device * device, + PjRtDevice * device, DeviceForDLContext(*client->pjrt_client(), dlmt->dl_tensor.ctx)); absl::Span dimensions( reinterpret_cast(dlmt->dl_tensor.shape), dlmt->dl_tensor.ndim); diff --git a/tensorflow/compiler/xla/python/jax_jit.cc b/tensorflow/compiler/xla/python/jax_jit.cc new file mode 100644 index 00000000000..96cf1e64b85 --- /dev/null +++ b/tensorflow/compiler/xla/python/jax_jit.cc @@ -0,0 +1,708 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This files implements the `jax.jit` dispatch and just-in-time feature. +// +// In a nutshell, `Jit(f)` returns a callable that will dispatch (i.e. forward +// based on passed arguments dtypes/shapes/identity) the execution to a +// just-in-time compiled XLA Executable. All of that is done in C++ for +// performance reasons. 
+// +// This file contains the utilities to: +// (a) inspect arguments and describe their structure, dtype/shapes, etc. +// (b) keep a mapping from function signatures to compiled XLA Executables. + +#include "tensorflow/compiler/xla/python/jax_jit.h" + +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/container/inlined_vector.h" +#include "absl/types/optional.h" +#include "pybind11/cast.h" +#include "pybind11/numpy.h" +#include "pybind11/pybind11.h" +#include "pybind11/pytypes.h" +#include "tensorflow/compiler/xla/pjrt/pjrt_client.h" +#include "tensorflow/compiler/xla/python/py_buffer.h" +#include "tensorflow/compiler/xla/python/py_executable.h" +#include "tensorflow/compiler/xla/python/pytree.h" +#include "tensorflow/compiler/xla/python/types.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/platform/status.h" + +namespace xla { + +namespace py = pybind11; + +// TODO(phawkins): Add support for Tracers. +// TODO(jblespiau): Add support for donate_argnums. +// TODO(jblespiau): Use absl Status. + +namespace { + +// Describes the abstract shape and dtype of an argument. +struct ArgSignature { + // This is the XLA dtype of the object. + xla::PrimitiveType dtype; + // JAX arguments can be of weak type, if and only if they are Python scalars + // or `DeviceArray` values such that `aval.weak_type` is true. + bool weak_type; + absl::InlinedVector shape; + bool operator==(const ArgSignature& other) const { + return std::tie(dtype, weak_type, shape) == + std::tie(other.dtype, other.weak_type, other.shape); + } + bool operator!=(const ArgSignature& other) const { return !(*this == other); } + + std::string DebugString() const { + std::string result = ""; + if (weak_type) { + absl::StrAppend(&result, "weak_"); + } + absl::StrAppend(&result, xla::PrimitiveType_Name(dtype)); + absl::StrAppend(&result, "[", absl::StrJoin(shape, ","), "]"); + return result; + } +}; + +template +H AbslHashValue(H h, const ArgSignature& s) { + h = H::combine(std::move(h), s.dtype); + if (!s.shape.empty()) { + h = H::combine_contiguous(std::move(h), &s.shape.front(), s.shape.size()); + } + return h; +} + +// The signature of Python jitted function call, partitioned into: +// - dynamic positional arguments (i.e. positional args which are not static) +// - static positional arguments (i.e. the args associated to static_argnums) +// - keyword arguments +// The CallSignature should unambiguously identify a function call, thus, +// equality is based on: +// (a) Same PyTree for all dynamic positional arguments and keyword arguments +// (a) equality of the arguments and keyword arguments ArgSignature +// (a) equality (delegated to Python) of the static arguments. +struct CallSignature { + struct KwargEntry { + // To avoid comparing strings, we intern the kwargs strings. + // The compilation cache holds a reference to all the keys. + py::handle key; + PyTreeDef value_treedef; + bool operator==(const KwargEntry& other) const { + return key.ptr() == other.key.ptr() && + value_treedef == other.value_treedef; + } + bool operator!=(const KwargEntry& other) const { return !(*this == other); } + }; + + // Only contains the arguments associated to `static_argnums`, sorted in the + // order of their argnum index. + std::vector static_args; + // A PyTreeDef for each positional dynamic (i.e. not static) argument. 
+ std::vector dynamic_positional_args_treedef; + // Keyword arguments. Sorted by the interned keyword pointers. + std::vector keyword_args; + // Shape and dtype for both the dynamic positional arguments and the keyword + // arguments (sorted by interned keyword pointers). + std::vector dynamic_args_signatures; + + bool operator==(const CallSignature& other) const { + return std::tie(dynamic_positional_args_treedef, static_args, keyword_args, + dynamic_args_signatures) == + std::tie(other.dynamic_positional_args_treedef, other.static_args, + other.keyword_args, other.dynamic_args_signatures); + } + bool operator!=(const CallSignature& other) const { + return !(*this == other); + } + + // To be used when we want to keep ownership of Python values referenced by + // the `CallSignature` (i.e. when we insert an entry). + void IncRef() const; + // The destructor of the cache should call this on all entries. + void DecRef() const; + + std::string DebugString() const; +}; + +void CallSignature::IncRef() const { + for (const auto& kw : keyword_args) { + kw.key.inc_ref(); + } +} + +void CallSignature::DecRef() const { + for (const auto& kw : keyword_args) { + kw.key.dec_ref(); + } +} + +template +H AbslHashValue(H h, const CallSignature::KwargEntry& kw) { + h = H::combine(std::move(h), kw.key.ptr(), kw.value_treedef); + return h; +} + +template +H AbslHashValue(H h, const CallSignature& s) { + // /!\ important: We cannot include static arguments to the hash, because + // the py::object must be hashable for absl. We can try delegating to the + // Python __hash__, but there are many non-hashable Python types such as + // np.ndarray. + // TODO(jblespiau): We should either ban non-hashable objects from jit or we + // should hash them by object identity. + h = H::combine_contiguous(std::move(h), + &s.dynamic_positional_args_treedef.front(), + s.dynamic_positional_args_treedef.size()); + h = H::combine_contiguous(std::move(h), &s.keyword_args.front(), + s.keyword_args.size()); + h = H::combine_contiguous(std::move(h), &s.dynamic_args_signatures.front(), + s.dynamic_args_signatures.size()); + return h; +} + +std::string CallSignature::DebugString() const { + std::vector static_args_str; + static_args_str.reserve(static_args.size()); + for (auto& static_arg : static_args) { + static_args_str.emplace_back(py::cast(static_arg.str())); + } + + std::vector signature_str; + signature_str.reserve(dynamic_args_signatures.size()); + + for (auto& arg_signature : dynamic_args_signatures) { + signature_str.emplace_back(arg_signature.DebugString()); + } + std::vector tree_def_str; + signature_str.reserve(dynamic_positional_args_treedef.size()); + for (auto& tree_def : dynamic_positional_args_treedef) { + tree_def_str.emplace_back(tree_def.ToString()); + } + std::vector keyword_names; + keyword_names.reserve(keyword_args.size()); + for (auto& kwarg_entry : keyword_args) { + keyword_names.emplace_back(py::cast(kwarg_entry.key)); + tree_def_str.emplace_back(kwarg_entry.value_treedef.ToString()); + } + return absl::StrCat( + static_args.size(), " static_args: ", absl::StrJoin(static_args_str, ","), + "\n", // new line + keyword_args.size(), " keyword args:", absl::StrJoin(keyword_names, ","), + "\n", // new-line + dynamic_positional_args_treedef.size(), " positional args.\n", + dynamic_args_signatures.size(), + " dynamic args (positional+keyword):\n - ", + absl::StrJoin(signature_str, ", "), "\n - ", + absl::StrJoin(tree_def_str, " | ")); +} + +struct CacheEntry { + std::shared_ptr executable; + xla::PjRtDevice* device; + 
PyTreeDef out_pytree_def; + // These are the objects required to create a `DeviceArray` object. + // We use Python types within the vector because this is what we will be + // returning to Python. No need to convert back and forth. + // We need py::object to maintain the objects alive. + std::vector out_avals; + std::vector out_lazy_exprs; +}; + +// A `CompiledFunction` is associated to a `jax.jit(f)` and takes care of the +// bookkeeping of the different signatures used and the dispatch of calls to +// the correct underlying `PyExecutable`. +class CompiledFunction { + public: + CompiledFunction(py::function cache_miss_fun, py::function python_f_jitted, + bool jax_enable_x64, std::vector static_argnums, + std::shared_ptr pyclient, + xla::PjRtDevice* device); + ~CompiledFunction(); + + // This function will: + // (a) flatten the inputs using pytree + // (b) get buffer objects from the arguments + // (c) call the executable + // (d) construct `DeviceArray` objects from the outputs + // (e) reconstruct the `PyTree`. + py::object Call(py::args args, py::kwargs kwargs); + + private: + CacheEntry& GetCacheEntry(const py::args& args, const py::kwargs& kwargs, + const CallSignature& signature); + + // The Python function in charge of returning a `xla::PyExecutable` from + // the arguments passed to `jitted_f`. + const py::function cache_miss_fun_; + // A function to call as fallback. This is the result of calling the Python + // `jax.jit`. + // TODO(jblespiau): Delete this when the C++ codepath supports all features. + const py::function python_f_jitted_; + + // The value of the Python flag when the object was created. + const bool jax_enable_x64_; + + // We need to know the static arguments to remove them from the arguments + // passed to the underlying PyExecutable. In sorted order. + std::vector static_argnums_; + // We need a `unique_ptr` here to ensure value pointer stability. + absl::flat_hash_map> executables_; + + const std::shared_ptr pyclient_; + xla::PjRtDevice* const default_device_; +}; + +CompiledFunction::CompiledFunction(py::function cache_miss_fun, + py::function python_f_jitted, + bool jax_enable_x64, + std::vector static_argnums, + std::shared_ptr pyclient, + xla::PjRtDevice* device) + : cache_miss_fun_(std::move(cache_miss_fun)), + python_f_jitted_(std::move(python_f_jitted)), + jax_enable_x64_(jax_enable_x64), + static_argnums_(std::move(static_argnums)), + pyclient_(std::move(pyclient)), + default_device_(device) { + std::sort(static_argnums_.begin(), static_argnums_.end()); +} + +CompiledFunction::~CompiledFunction() { + for (const auto& entry : executables_) { + entry.first.DecRef(); + } +} + +namespace { + +// The resulting information of the parsing and conversion of the arguments. +struct ParsedArgumentsAsBuffers { + // The call signature will be filled during 2 steps: + // - `FlattenArguments` will fill the static arguments and the pytree + // structures + // - the shapes and dtypes are filled later, by `ParseAndTransferArguments`. + CallSignature signature; + // The concatenation of the dynamic positional arguments and the sorted + // keyword arguments. We do not need ownership, thus the py::handle. + // TODO(jblespiau): We do not need py::object here and py::handle suffice and + // will prevent any counter increment. + std::vector flat_dynamic_args; + std::vector keep_alive_objects; + + // The following is only valid if the parsing succeeds. 
+ std::vector arg_buffers; + // We may need to keep some objects around, because: + // (a) we need to extend the lifetime of objects created within + // `ConvertArgsToBuffers` + // (b) `arg_buffers` do not maintain ownership + std::vector, + std::unique_ptr>> + keep_alive; +}; + +// Filter out static arguments, flatten and concatenate other arguments (i.e. +// dynamic positional and keyword arguments), filling `arguments` in place. +void FlattenArguments(const py::args& args, const py::kwargs& py_kwargs, + absl::Span static_argnums, + ParsedArgumentsAsBuffers& arguments) { + arguments.flat_dynamic_args.reserve(args.size() + py_kwargs.size() - + static_argnums.size()); + arguments.signature.dynamic_positional_args_treedef.reserve( + args.size() - static_argnums.size()); + + // Positional arguments. + for (size_t i = 0; i < args.size(); ++i) { + if (std::find(static_argnums.begin(), static_argnums.end(), i) == + static_argnums.end()) { + PyTreeDef pytree_def; + pytree_def.FlattenInto(args[i], arguments.flat_dynamic_args); + arguments.signature.dynamic_positional_args_treedef.push_back(pytree_def); + } else { + arguments.signature.static_args.emplace_back( + // borrow is mandatory here. + py::reinterpret_borrow(args[i])); + } + } + + // Keyword arguments. + std::vector> kwargs(py_kwargs.begin(), + py_kwargs.end()); + // We first intern the keys, then sort them (by pointer) and then create + // the signatures. + arguments.signature.keyword_args.resize(kwargs.size()); + for (size_t i = 0; i < kwargs.size(); ++i) { + // Intern the key if not already interned. + if (!PyUnicode_CHECK_INTERNED(kwargs[i].first.ptr())) { + PyObject* key = kwargs[i].first.ptr(); + kwargs[i].first.inc_ref(); + PyUnicode_InternInPlace(&key); + arguments.keep_alive_objects.push_back( + py::reinterpret_steal(key)); + kwargs[i].first = py::handle(key); + } + } + + std::sort(kwargs.begin(), kwargs.end(), + [](const std::pair& a, + const std::pair& b) { + return a.first.ptr() < b.first.ptr(); + }); + for (size_t i = 0; i < kwargs.size(); ++i) { + arguments.signature.keyword_args[i].key = kwargs[i].first; + arguments.signature.keyword_args[i].value_treedef.FlattenInto( + kwargs[i].second, arguments.flat_dynamic_args); + } +} + +template +std::unique_ptr ConvertToScalarBuffer( + const py::handle& scalar, xla::PjRtClient* client, + xla::PjRtDevice* device) { + CppType data = py::cast(scalar); + xla::Shape shape = xla::ShapeUtil::MakeShapeWithType({}); + return ValueOrThrow(xla::PjRtBuffer::FromHostBuffer( + &data, shape, + xla::PjRtBuffer::HostBufferSemantics::kImmutableOnlyDuringCall, nullptr, + client, device)); +} + +// Convert a scalar to the associated PjRtBuffer or raises an error if it is +// not convertible (thus, this must be called after other checks). +StatusOr> ScalarToBuffer( + py::handle scalar, bool jax_enable_x64, xla::PjRtClient* client, + xla::PjRtDevice* device) { + // Important: In Python, isinstance(True, int) returns True. Thus, we have + // to check for bool before int. 
+ if (py::isinstance(scalar)) { + return ConvertToScalarBuffer(scalar, client, device); + } else if (py::isinstance(scalar)) { + if (jax_enable_x64) { + return ConvertToScalarBuffer(scalar, client, device); + } else { + return ConvertToScalarBuffer(scalar, client, device); + } + } else if (py::isinstance(scalar)) { + if (jax_enable_x64) { + return ConvertToScalarBuffer(scalar, client, device); + + } else { + return ConvertToScalarBuffer(scalar, client, device); + } + } else if (PyComplex_Check(scalar.ptr())) { + Py_complex result = PyComplex_AsCComplex(scalar.ptr()); + if (result.real == -1.0 && PyErr_Occurred()) { + PyErr_Clear(); + throw std::runtime_error("Could not convert the complex number"); + } + if (jax_enable_x64) { + xla::complex128 data(result.real, result.imag); + xla::Shape shape = xla::ShapeUtil::MakeShapeWithType({}); + return ValueOrThrow(xla::PjRtBuffer::FromHostBuffer( + &data, shape, + xla::PjRtBuffer::HostBufferSemantics::kImmutableOnlyDuringCall, + nullptr, client, device)); + } else { + xla::complex64 data(result.real, result.imag); + xla::Shape shape = xla::ShapeUtil::MakeShapeWithType({}); + return ValueOrThrow(xla::PjRtBuffer::FromHostBuffer( + &data, shape, + xla::PjRtBuffer::HostBufferSemantics::kImmutableOnlyDuringCall, + nullptr, client, device)); + } + } + return InvalidArgument( + "%s", absl::StrCat( + "Not supported: The C++ jax jit execution path, only accepts " + "DeviceArray, Numpy arrays, or Python scalars. Got type ", + py::cast(scalar.get_type().str()))); +} + +const py::dtype* DtypeTo32BitDtype(const py::dtype& dtype) { + static const auto* int64_dt = new py::dtype("int64"); + static const auto* int32_dt = new py::dtype("int32"); + static const auto* uint64_dt = new py::dtype("uint64"); + static const auto* uint32_dt = new py::dtype("uint32"); + static const auto* float64_dt = new py::dtype("float64"); + static const auto* float32_dt = new py::dtype("float32"); + static const auto* complex64_dt = new py::dtype("complex64"); + static const auto* complex128_dt = new py::dtype("complex128"); + + if (dtype == *int64_dt) { + return int32_dt; + } + if (dtype == *float64_dt) { + return float32_dt; + } + if (dtype == *uint64_dt) { + return uint32_dt; + } + if (dtype == *complex128_dt) { + return complex64_dt; + } + + return nullptr; +} + +// Converts flattened arguments contained in ParsedArgumentsAsBuffers in +// place. If arguments are `DeviceArray`, they must all be on the same `Device`. +// +// Returns `OkStatus()` on success. +Status ConvertArgsToBuffers(bool jax_enable_x64, xla::PyClient& pyclient, + xla::PjRtDevice* default_device, + ParsedArgumentsAsBuffers& arguments) { + std::vector& arg_buffers = arguments.arg_buffers; + auto& keep_alive = arguments.keep_alive; + + int num_flat_dynamic_args = arguments.flat_dynamic_args.size(); + arg_buffers.reserve(num_flat_dynamic_args); + arguments.signature.dynamic_args_signatures.reserve(num_flat_dynamic_args); + + static const auto* xla_module = + new py::module(py::module::import("jax.interpreters.xla")); + const auto& device_array = xla_module->attr("DeviceArray"); + + static const auto* numpy_module = new py::module(py::module::import("numpy")); + const auto& array = numpy_module->attr("array"); + + // TODO(phawkins): consider device stickiness. + // We first check whether any `DeviceArray` is present and whether they are + // attached to any specific device. 
See also + // https://github.com/google/jax/pull/1884 + // https://github.com/google/jax/pull/1916 for the rationale why the + // computation follows the data locality. + // It's also similar to PyTorch's behavior. + xla::PjRtDevice* data_device = nullptr; + for (py::handle arg : arguments.flat_dynamic_args) { + if (py::isinstance(arg, device_array)) { + xla::PyBuffer* buffer = + py::cast(arg.attr("device_buffer")); + xla::PjRtDevice* device = buffer->buffer()->device(); + if (data_device && (device != data_device)) { + return InvalidArgument( + "%s", + absl::StrCat( + "Arguments to a jit-compiled function must be colocated on the " + "same device. Arguments were found to be on the two following " + "different devices: ", + device->DebugString(), " and ", data_device->DebugString())); + } else { + data_device = device; + } + } + } + if (!data_device) { + // No `DeviceArray` were found default to `default_device`. + data_device = default_device; + } + xla::PjRtClient* pjrt_client = data_device->client(); + + for (py::handle arg : arguments.flat_dynamic_args) { + // We do not support here d2d transparent transfers. + // We assumes all the `DeviceArray` are already on the correct and shared + // device. + if (py::isinstance(arg, device_array)) { + xla::PyBuffer* buffer = + py::cast(arg.attr("device_buffer")); + arg_buffers.push_back(buffer->buffer()); + ArgSignature sig; + sig.dtype = buffer->shape().element_type(); + sig.shape.assign(buffer->shape().dimensions().begin(), + buffer->shape().dimensions().end()); + sig.weak_type = py::cast(arg.attr("aval").attr("weak_type")); + arguments.signature.dynamic_args_signatures.push_back(std::move(sig)); + } else if (py::isinstance(arg)) { + // TODO(jblespiau): Can we improve this call? Do we need the underlying + // GlobalPyRefManager() and co? + py::array numpy_array = py::cast(arg); + // If jax_enable_x64 is not set, we need to coerce 32 bits types. + // Note that this is calling back to Python! + // TODO(jblespiau): We can remove this complexity when we delete + // jax_enable_x64 mode. + if (!jax_enable_x64) { + const py::dtype* to_dtype = DtypeTo32BitDtype(numpy_array.dtype()); + if (to_dtype) { + numpy_array = array(numpy_array, to_dtype); + } + } + std::unique_ptr buffer = + ValueOrThrow(pyclient.BufferFromPyval( + numpy_array, data_device, + /*force_copy=*/false, /*host_buffer_semantics=*/ + xla::PjRtBuffer::HostBufferSemantics::kZeroCopy)); + arg_buffers.push_back(buffer->buffer()); + + ArgSignature sig; + sig.dtype = buffer->shape().element_type(); + sig.shape.assign(buffer->shape().dimensions().begin(), + buffer->shape().dimensions().end()); + arguments.signature.dynamic_args_signatures.push_back(sig); + + keep_alive.emplace_back(std::move(buffer)); + } else { + StatusOr> buffer = + ScalarToBuffer(arg, jax_enable_x64, pjrt_client, data_device); + if (!buffer.ok()) { + return buffer.status(); + } + arg_buffers.push_back(buffer.ValueOrDie().get()); + ArgSignature sig; + sig.dtype = buffer.ValueOrDie()->on_host_shape().element_type(); + sig.weak_type = true; + arguments.signature.dynamic_args_signatures.push_back(sig); + + keep_alive.emplace_back(std::move(buffer).ValueOrDie()); + } + } + return Status::OK(); +} + +} // namespace + +CacheEntry& CompiledFunction::GetCacheEntry(const py::args& args, + const py::kwargs& kwargs, + const CallSignature& signature) { + auto found_iterator = executables_.find(signature); + if (found_iterator != executables_.end()) { // Cache hit! 
+ return *(found_iterator->second); + } + + // We need to insert the element. + auto result = executables_.emplace(signature, std::make_unique<CacheEntry>()); + auto it = result.first; + + // CallSignatures in the cache own their keyword argument reference. + result.first->first.IncRef(); + + // Cache miss? Call the Python cache miss function. + py::tuple executable_and_pytree = cache_miss_fun_(*args, **kwargs); + if (executable_and_pytree.size() != 4) { + throw std::runtime_error( + "AssertionError: The cache miss function should return 4 " + "arguments."); + } + it->second->executable = py::cast<std::shared_ptr<xla::PyExecutable>>( + std::move(executable_and_pytree[0])); + int num_devices = + it->second->executable->pjrt_executable().local_devices().size(); + if (num_devices != 1) { + throw std::runtime_error(absl::StrCat( + "Running on more than a single device is not currently supported. " + "The underlying PjRtExecutable has ", + num_devices)); + } + it->second->device = + it->second->executable->pjrt_executable().local_devices()[0]; + it->second->out_pytree_def = py::cast<PyTreeDef>(executable_and_pytree[1]); + + py::list shaped_arrays = + py::reinterpret_borrow<py::list>(executable_and_pytree[2]); + py::list lazy_expressions = + py::reinterpret_borrow<py::list>(executable_and_pytree[3]); + + it->second->out_avals.reserve(shaped_arrays.size()); + it->second->out_lazy_exprs.reserve(lazy_expressions.size()); + + int num_outputs = shaped_arrays.size(); + for (int i = 0; i < num_outputs; ++i) { + py::object shaped_array = + py::reinterpret_borrow<py::object>(shaped_arrays[i]); + py::object lazy_expr = + py::reinterpret_borrow<py::object>(lazy_expressions[i]); + + it->second->out_avals.push_back(shaped_array); + it->second->out_lazy_exprs.push_back(lazy_expr); + } + + return *(it->second); +} + +py::object CompiledFunction::Call(py::args args, py::kwargs kwargs) { + ParsedArgumentsAsBuffers arguments; + FlattenArguments(args, kwargs, static_argnums_, arguments); + + // The C++ jit does not support Tracer arguments yet. The Python-based jit + // function will be called if any of the dynamic arguments is unsupported.
+ if (!ConvertArgsToBuffers(jax_enable_x64_, *pyclient_, default_device_, + arguments) + .ok()) { + return python_f_jitted_(*args, **kwargs); + } + + CacheEntry& cache_entry = GetCacheEntry(args, kwargs, arguments.signature); + + std::vector> outputs = + ValueOrThrow(cache_entry.executable->PjRtExecute(arguments.arg_buffers)); + + static const auto* xla_module = + new py::module(py::module::import("jax.interpreters.xla")); + const auto& device_array = xla_module->attr("DeviceArray"); + + const std::vector& out_avals = cache_entry.out_avals; + const std::vector& out_lazy_exprs = cache_entry.out_lazy_exprs; + + py::list flat_device_arrays; + for (int i = 0; i < outputs.size(); ++i) { + flat_device_arrays.append(device_array( + /*aval=*/out_avals[i], /*device=*/outputs[i]->device(), + /*lazy_expr=*/out_lazy_exprs[i], + /*device_buffer=*/std::move(outputs[i]))); + } + return cache_entry.out_pytree_def.Unflatten(flat_device_arrays); +} + +} // namespace + +void BuildJaxjitSubmodule(pybind11::module& m) { + py::module jitlib = m.def_submodule("jax_jit", "Jax C++ jit library"); + + py::class_> cfun( + jitlib, "CompiledFunction"); + cfun.def("__call__", &CompiledFunction::Call); + + jitlib.def("jit", + [](py::function cache_miss_fun, + py::function fallback_on_unsupported_argument, + bool jax_enable_x64, std::vector static_argnums, + xla::ClientAndPtr client_and_device) + -> std::unique_ptr { + return std::make_unique( + std::move(cache_miss_fun), + std::move(fallback_on_unsupported_argument), jax_enable_x64, + std::move(static_argnums), client_and_device.client, + client_and_device.contents); + }); + + // Only for testing purposes + jitlib.def("_ScalarToBuffer", [](py::handle scalar, bool jax_enable_x64, + std::shared_ptr client) { + xla::PjRtClient* pjrt_client = client->pjrt_client(); + + return std::make_unique( + client, + ScalarToBuffer(scalar, jax_enable_x64, pjrt_client, + pjrt_client->local_devices()[0]) + .ValueOrDie(), + nullptr); + }); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/python/jax_jit.h b/tensorflow/compiler/xla/python/jax_jit.h new file mode 100644 index 00000000000..2b1603aac27 --- /dev/null +++ b/tensorflow/compiler/xla/python/jax_jit.h @@ -0,0 +1,27 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_JAX_JIT_H_ +#define TENSORFLOW_COMPILER_XLA_PYTHON_JAX_JIT_H_ + +#include "pybind11/pybind11.h" + +namespace xla { + +void BuildJaxjitSubmodule(pybind11::module& m); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_PYTHON_JAX_JIT_H_ diff --git a/tensorflow/compiler/xla/python/outfeed_receiver.cc b/tensorflow/compiler/xla/python/outfeed_receiver.cc index 7c029ca7d19..f6067e650c0 100644 --- a/tensorflow/compiler/xla/python/outfeed_receiver.cc +++ b/tensorflow/compiler/xla/python/outfeed_receiver.cc @@ -101,14 +101,14 @@ uint32_t constexpr kOutfeedCidShutdown = 0; // Encapsulates data received from a device outfeed. class OutfeedData { public: - OutfeedData(Device* device, uint32_t consumer_id, Shape shape) + OutfeedData(PjRtDevice* device, uint32_t consumer_id, Shape shape) : device_(device), consumer_id_(consumer_id), shape_(shape), literal_(nullptr), literal_size_bytes_(0) {} - Device* device() { return device_; } + PjRtDevice* device() { return device_; } uint32_t consumer_id() const { return consumer_id_; } Shape shape() const { return shape_; } std::unique_ptr literal() { @@ -123,7 +123,7 @@ class OutfeedData { std::string DebugString() const; private: - Device* device_; + PjRtDevice* device_; uint32_t consumer_id_; Shape shape_; std::unique_ptr literal_; @@ -187,8 +187,8 @@ class OutfeedReceiverImpl { Status SendShutdownOutfeedHeader(int device_idx); // Receives a raw Literal from a device outfeed. - StatusOr> ReceiveRawFromOutfeed(const Device* device, - const Shape& shape); + StatusOr> ReceiveRawFromOutfeed( + const PjRtDevice* device, const Shape& shape); // Enqueues received data in the callbaback queue. void EnqueueReceivedData(std::unique_ptr received) @@ -200,7 +200,7 @@ class OutfeedReceiverImpl { OutfeedReceiver::Callback callback_; // The devices on which we are listening. - std::vector devices_; + std::vector devices_; // Maximum bytes capacity of the callback queue. uint64_t max_callback_queue_size_bytes_; @@ -283,7 +283,7 @@ void OutfeedReceiverImpl::DeviceListenerThreadLoop(int device_idx) { absl::MutexLock lock(&mu_); ++num_listening_threads_; } - Device* device = devices_[device_idx]; + PjRtDevice* device = devices_[device_idx]; while (true) { Shape header_shape = ShapeUtil::MakeShape(U32, {kOutfeedHeaderWords}); std::unique_ptr header = @@ -339,7 +339,7 @@ void OutfeedReceiverImpl::EnqueueReceivedData( } StatusOr> OutfeedReceiverImpl::ReceiveRawFromOutfeed( - const Device* device, const Shape& shape) { + const PjRtDevice* device, const Shape& shape) { std::shared_ptr literal_shared; TF_ASSIGN_OR_RETURN(LocalDeviceState * local_device, @@ -390,7 +390,7 @@ void OutfeedReceiverImpl::CallbackThreadLoop() { } Status OutfeedReceiverImpl::SendShutdownOutfeedHeader(int device_idx) { - const Device* device = devices_[device_idx]; + const PjRtDevice* device = devices_[device_idx]; constexpr int consumer_id = kOutfeedCidShutdown; VLOG(2) << "[" << device->DebugString() << "] SendSpecialHeader cons=" << consumer_id; diff --git a/tensorflow/compiler/xla/python/outfeed_receiver.h b/tensorflow/compiler/xla/python/outfeed_receiver.h index a8dcc559810..46e2e5d9526 100644 --- a/tensorflow/compiler/xla/python/outfeed_receiver.h +++ b/tensorflow/compiler/xla/python/outfeed_receiver.h @@ -33,7 +33,7 @@ class OutfeedReceiver { public: // A callback takes: device, consumer id, received. 
using Callback = - std::function)>; + std::function)>; // Constructs the receiver for the given clients and callback function. // diff --git a/tensorflow/compiler/xla/python/outfeed_receiver_py.cc b/tensorflow/compiler/xla/python/outfeed_receiver_py.cc index d297df332ff..a732ab8e21a 100644 --- a/tensorflow/compiler/xla/python/outfeed_receiver_py.cc +++ b/tensorflow/compiler/xla/python/outfeed_receiver_py.cc @@ -40,7 +40,7 @@ class OutfeedReceiverForPython { public: // A callback to Python takes: consumer id, received literal. using CallbackToPython = - std::function, uint32_t, pybind11::object)>; + std::function, uint32_t, pybind11::object)>; OutfeedReceiverForPython(CallbackToPython callback_python, std::vector> clients, @@ -48,7 +48,7 @@ class OutfeedReceiverForPython { : callback_python_(std::move(callback_python)), clients_(std::move(clients)) { OutfeedReceiver::Callback callback = - [this](Device* device, uint32_t consumer_id, + [this](PjRtDevice* device, uint32_t consumer_id, std::shared_ptr literal) { this->Callback(device, consumer_id, std::move(literal)); }; @@ -86,7 +86,7 @@ class OutfeedReceiverForPython { arrays); } - void Callback(Device* device, uint32_t consumer_id, + void Callback(PjRtDevice* device, uint32_t consumer_id, std::shared_ptr literal) { { absl::MutexLock lock(&mu_); @@ -106,7 +106,7 @@ class OutfeedReceiverForPython { LiteralToPython(std::move(literal)).ValueOrDie(); // The callback_ should handle all exceptions in user-code. If we get // an exception here, it is a bug in the callback and we should stop. - callback_python_(WrapWithClient(*it, device), consumer_id, + callback_python_(WrapWithClient(*it, device), consumer_id, std::move(literal_python)); } diff --git a/tensorflow/compiler/xla/python/outfeed_receiver_test.cc b/tensorflow/compiler/xla/python/outfeed_receiver_test.cc index e8a5063b70b..919dafe2e0b 100644 --- a/tensorflow/compiler/xla/python/outfeed_receiver_test.cc +++ b/tensorflow/compiler/xla/python/outfeed_receiver_test.cc @@ -78,11 +78,11 @@ TEST(OutfeedReceiverTest, ReceiveOutfeedSimple) { std::vector clients{cpu_client.get()}; auto receiver = absl::make_unique(); - OutfeedReceiver::Callback callback = [&receiver]( - Device* device, uint32_t consumer_id, - std::shared_ptr data) { - receiver->Receive(consumer_id, data); - }; + OutfeedReceiver::Callback callback = + [&receiver](PjRtDevice* device, uint32_t consumer_id, + std::shared_ptr data) { + receiver->Receive(consumer_id, data); + }; auto outfeed_receiver = std::make_shared(callback, clients, 128); outfeed_receiver->Start(); @@ -111,11 +111,11 @@ TEST(OutfeedReceiverTest, ReceiveOutfeedTwoComputations) { std::vector clients{cpu_client.get()}; auto receiver = absl::make_unique(); - OutfeedReceiver::Callback callback = [&receiver]( - Device* device, uint32_t consumer_id, - std::shared_ptr data) { - receiver->Receive(consumer_id, data); - }; + OutfeedReceiver::Callback callback = + [&receiver](PjRtDevice* device, uint32_t consumer_id, + std::shared_ptr data) { + receiver->Receive(consumer_id, data); + }; auto outfeed_receiver = std::make_shared(callback, clients, 128); outfeed_receiver->Start(); @@ -156,11 +156,11 @@ TEST(OutfeedReceiverTest, ReceiveOutfeedTwoOutfeed) { std::vector clients{cpu_client.get()}; auto receiver = absl::make_unique(); - OutfeedReceiver::Callback callback = [&receiver]( - Device* device, uint32_t consumer_id, - std::shared_ptr data) { - receiver->Receive(consumer_id, data); - }; + OutfeedReceiver::Callback callback = + [&receiver](PjRtDevice* device, uint32_t 
consumer_id, + std::shared_ptr data) { + receiver->Receive(consumer_id, data); + }; auto outfeed_receiver = std::make_shared(callback, clients, 128); outfeed_receiver->Start(); @@ -199,11 +199,11 @@ TEST(OutfeedReceiverTest, DifferentShapeForConsumerIdError) { std::vector clients{cpu_client.get()}; auto receiver = absl::make_unique(); - OutfeedReceiver::Callback callback = [&receiver]( - Device* device, uint32_t consumer_id, - std::shared_ptr data) { - receiver->Receive(consumer_id, data); - }; + OutfeedReceiver::Callback callback = + [&receiver](PjRtDevice* device, uint32_t consumer_id, + std::shared_ptr data) { + receiver->Receive(consumer_id, data); + }; auto outfeed_receiver = std::make_shared(callback, clients, 128); outfeed_receiver->Start(); @@ -233,11 +233,11 @@ TEST(OutfeedReceiverTest, InvalidConsumerIdError) { std::vector clients{cpu_client.get()}; auto receiver = absl::make_unique(); - OutfeedReceiver::Callback callback = [&receiver]( - Device* device, uint32_t consumer_id, - std::shared_ptr data) { - receiver->Receive(consumer_id, data); - }; + OutfeedReceiver::Callback callback = + [&receiver](PjRtDevice* device, uint32_t consumer_id, + std::shared_ptr data) { + receiver->Receive(consumer_id, data); + }; auto outfeed_receiver = std::make_shared(callback, clients, 128); outfeed_receiver->Start(); diff --git a/tensorflow/compiler/xla/python/py_buffer.cc b/tensorflow/compiler/xla/python/py_buffer.cc index ed4787310b4..b32fe047530 100644 --- a/tensorflow/compiler/xla/python/py_buffer.cc +++ b/tensorflow/compiler/xla/python/py_buffer.cc @@ -51,12 +51,12 @@ PyBuffer::~PyBuffer() { } } -ClientAndPtr PyBuffer::device() const { +ClientAndPtr PyBuffer::device() const { return WrapWithClient(client_, buffer_->device()); } StatusOr> PyBuffer::CopyToDevice( - const ClientAndPtr& dst_device) const { + const ClientAndPtr& dst_device) const { CHECK(dst_device.get() != nullptr); GlobalPyRefManager()->CollectGarbage(); std::unique_ptr out; diff --git a/tensorflow/compiler/xla/python/py_buffer.h b/tensorflow/compiler/xla/python/py_buffer.h index 76791e969cb..d7906574ec1 100644 --- a/tensorflow/compiler/xla/python/py_buffer.h +++ b/tensorflow/compiler/xla/python/py_buffer.h @@ -38,12 +38,12 @@ class PyBuffer { std::shared_ptr client() const { return client_; } PjRtBuffer* buffer() const { return buffer_.get(); } - ClientAndPtr device() const; + ClientAndPtr device() const; const std::string& platform_name() const { return buffer_->platform_name(); } bool is_deleted() const { return buffer_->IsDeleted(); } StatusOr> CopyToDevice( - const ClientAndPtr& dst_device) const; + const ClientAndPtr& dst_device) const; void Delete() { return buffer_->Delete(); } diff --git a/tensorflow/compiler/xla/python/py_client.cc b/tensorflow/compiler/xla/python/py_client.cc index 9b95f8e03de..6df11322564 100644 --- a/tensorflow/compiler/xla/python/py_client.cc +++ b/tensorflow/compiler/xla/python/py_client.cc @@ -33,8 +33,8 @@ namespace pprof = tensorflow::tfprof::pprof; PyClient::PyClient(std::shared_ptr pjrt_client) : pjrt_client_(std::move(pjrt_client)) {} -std::vector> PyClient::Devices() { - std::vector> devices; +std::vector> PyClient::Devices() { + std::vector> devices; devices.reserve(pjrt_client_->devices().size()); for (const auto& device : pjrt_client_->devices()) { devices.push_back(WrapWithClient(shared_from_this(), device.get())); @@ -42,21 +42,21 @@ std::vector> PyClient::Devices() { return devices; } -std::vector> PyClient::LocalDevices() { - std::vector> devices; +std::vector> 
PyClient::LocalDevices() { + std::vector> devices; devices.reserve(pjrt_client_->local_devices().size()); - for (Device* device : pjrt_client_->local_devices()) { + for (PjRtDevice* device : pjrt_client_->local_devices()) { devices.push_back(WrapWithClient(shared_from_this(), device)); } return devices; } -StatusOr>>> +StatusOr>>> PyClient::GetDefaultDeviceAssignment(int num_replicas, int num_partitions) { TF_ASSIGN_OR_RETURN( DeviceAssignment device_assignment, pjrt_client_->GetDefaultDeviceAssignment(num_replicas, num_partitions)); - std::vector>> result; + std::vector>> result; result.resize(num_replicas); for (int r = 0; r < num_replicas; ++r) { result[r].resize(num_partitions); @@ -70,12 +70,12 @@ PyClient::GetDefaultDeviceAssignment(int num_replicas, int num_partitions) { return result; } -StatusOr>> +StatusOr>> PyClient::GetDefaultDeviceAssignment1D(int num_replicas) { TF_ASSIGN_OR_RETURN(DeviceAssignment device_assignment, pjrt_client_->GetDefaultDeviceAssignment( num_replicas, /*num_partitions=*/1)); - std::vector> result; + std::vector> result; for (int i = 0; i < num_replicas; ++i) { int device_id = device_assignment(i, 0); auto iter = pjrt_client_->id_to_device().find(device_id); @@ -86,7 +86,7 @@ PyClient::GetDefaultDeviceAssignment1D(int num_replicas) { } StatusOr> PyClient::BufferFromPyval( - const pybind11::object& argument, Device* device, bool force_copy, + const pybind11::object& argument, PjRtDevice* device, bool force_copy, PjRtBuffer::HostBufferSemantics host_buffer_semantics) { if (device == nullptr) { TF_RET_CHECK(!pjrt_client_->local_devices().empty()); @@ -206,7 +206,7 @@ namespace { struct HeapProfileKey { Traceback* traceback; int64 size; - Device* device; + PjRtDevice* device; bool operator==(const HeapProfileKey& other) const; }; diff --git a/tensorflow/compiler/xla/python/py_client.h b/tensorflow/compiler/xla/python/py_client.h index e41415c42f2..f12a4ae4f0a 100644 --- a/tensorflow/compiler/xla/python/py_client.h +++ b/tensorflow/compiler/xla/python/py_client.h @@ -100,14 +100,14 @@ class PyClient : public std::enable_shared_from_this { int device_count() const { return pjrt_client_->device_count(); } int host_id() const { return pjrt_client_->host_id(); } - std::vector> Devices(); - std::vector> LocalDevices(); + std::vector> Devices(); + std::vector> LocalDevices(); - StatusOr>>> + StatusOr>>> GetDefaultDeviceAssignment(int num_replicas, int num_partitions); // TODO(skye): delete after all callers can handle 2D output - StatusOr>> GetDefaultDeviceAssignment1D( + StatusOr>> GetDefaultDeviceAssignment1D( int num_replicas); StatusOr CreateChannelHandle() { @@ -121,7 +121,7 @@ class PyClient : public std::enable_shared_from_this { } StatusOr> BufferFromPyval( - const pybind11::object& argument, Device* device, bool force_copy, + const pybind11::object& argument, PjRtDevice* device, bool force_copy, PjRtBuffer::HostBufferSemantics host_buffer_semantics); StatusOr> Compile( diff --git a/tensorflow/compiler/xla/python/py_executable.cc b/tensorflow/compiler/xla/python/py_executable.cc index ed524f1cb33..53891b96846 100644 --- a/tensorflow/compiler/xla/python/py_executable.cc +++ b/tensorflow/compiler/xla/python/py_executable.cc @@ -58,10 +58,10 @@ PyExecutable::~PyExecutable() { } } -std::vector> PyExecutable::LocalDevices() const { - std::vector> devices; +std::vector> PyExecutable::LocalDevices() const { + std::vector> devices; devices.reserve(executable_->local_devices().size()); - for (Device* device : executable_->local_devices()) { + for (PjRtDevice* device : 
executable_->local_devices()) { devices.push_back(WrapWithClient(client_, device)); } return devices; diff --git a/tensorflow/compiler/xla/python/py_executable.h b/tensorflow/compiler/xla/python/py_executable.h index 24f177261e7..2e51548ae51 100644 --- a/tensorflow/compiler/xla/python/py_executable.h +++ b/tensorflow/compiler/xla/python/py_executable.h @@ -47,7 +47,7 @@ class PyExecutable { return executable_->local_logical_device_ids(); } - std::vector> LocalDevices() const; + std::vector> LocalDevices() const; int64 SizeOfGeneratedCodeInBytes() const { return executable_->SizeOfGeneratedCodeInBytes(); diff --git a/tensorflow/compiler/xla/python/pytree.cc b/tensorflow/compiler/xla/python/pytree.cc index 58d6a585b08..bf0bb1a8d93 100644 --- a/tensorflow/compiler/xla/python/pytree.cc +++ b/tensorflow/compiler/xla/python/pytree.cc @@ -107,7 +107,7 @@ bool PyTreeDef::operator==(const PyTreeDef& other) const { } void PyTreeDef::FlattenInto(py::handle handle, - std::vector& leaves) { + std::vector& leaves) { Node node; int start_num_nodes = traversal_.size(); int start_num_leaves = leaves.size(); @@ -158,23 +158,19 @@ void PyTreeDef::FlattenInto(py::handle handle, } } else { assert(node.kind == Kind::kLeaf); - leaves.push_back(handle); + leaves.push_back(pybind11::reinterpret_borrow(handle)); } node.num_nodes = traversal_.size() - start_num_nodes + 1; node.num_leaves = leaves.size() - start_num_leaves; traversal_.push_back(std::move(node)); } -/*static*/ std::pair> PyTreeDef::Flatten( - py::handle x) { - std::vector leaves; +/*static*/ std::pair, std::unique_ptr> +PyTreeDef::Flatten(py::handle x) { + std::vector leaves; auto tree = absl::make_unique(); tree->FlattenInto(x, leaves); - py::list outputs(leaves.size()); - for (int i = 0; i < leaves.size(); ++i) { - outputs[i] = py::reinterpret_borrow(leaves[i]); - } - return std::make_pair(std::move(outputs), std::move(tree)); + return std::make_pair(std::move(leaves), std::move(tree)); } /*static*/ bool PyTreeDef::AllLeaves(const py::iterable& x) { diff --git a/tensorflow/compiler/xla/python/pytree.h b/tensorflow/compiler/xla/python/pytree.h index 76fd76fad6a..69cd93a7d08 100644 --- a/tensorflow/compiler/xla/python/pytree.h +++ b/tensorflow/compiler/xla/python/pytree.h @@ -84,12 +84,12 @@ class PyTreeDef { PyTreeDef() = default; // Flattens a Pytree into a list of leaves and a PyTreeDef. - static std::pair> Flatten( - pybind11::handle x); + static std::pair, std::unique_ptr> + Flatten(pybind11::handle x); // Recursive helper used to implement Flatten(). void FlattenInto(pybind11::handle handle, - std::vector& leaves); + std::vector& leaves); // Tests whether the given list is a flat list of leaves. 
static bool AllLeaves(const pybind11::iterable& x); diff --git a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.cc b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.cc index e78f04ff980..e4fb2cdfd41 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.cc +++ b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.cc @@ -37,8 +37,8 @@ namespace xla { TpuDevice::TpuDevice(int id, int host_id, const std::array& coords, int core_on_chip) - : xla::Device(id, /*local_device_state=*/nullptr, kTpuPlatform, - /*device_kind=*/"Cloud TPU", host_id), + : xla::PjRtDevice(id, /*local_device_state=*/nullptr, kTpuPlatform, + /*device_kind=*/"Cloud TPU", host_id), coords_(coords), core_on_chip_(core_on_chip) {} @@ -47,9 +47,9 @@ std::string TpuDevice::DebugString() const { coords_[0], coords_[1], coords_[2], core_on_chip_); } -xla::StatusOr>> +xla::StatusOr>> TpuDevice::GetTpuDevices(const tpu_driver::SystemInfo& system_info) { - std::vector> devices; + std::vector> devices; for (const auto& chip : system_info.tpu_chip()) { auto& coord = chip.chip_coord(); std::array coords_array = {coord.x(), coord.y(), coord.z()}; @@ -78,7 +78,7 @@ StatusOr> PyTpuClient::Get( tpu_driver::SystemInfo system_info; client->QuerySystemInfo(&system_info); - TF_ASSIGN_OR_RETURN(std::vector> devices, + TF_ASSIGN_OR_RETURN(std::vector> devices, TpuDevice::GetTpuDevices(system_info)); return std::make_shared(kTpuPlatform, std::move(client), @@ -88,13 +88,13 @@ StatusOr> PyTpuClient::Get( PyTpuClient::PyTpuClient(std::string platform_name, std::unique_ptr driver, - std::vector> devices, + std::vector> devices, int host_id) : platform_name_(std::move(platform_name)), driver_(std::move(driver)), devices_(std::move(devices)), host_id_(host_id) { - for (const std::shared_ptr& device : devices_) { + for (const std::shared_ptr& device : devices_) { CHECK(id_to_device_.insert({device->id(), device}).second) << "Duplicate device id: " << device->id(); @@ -173,7 +173,7 @@ static Status CheckDataType(xla::PrimitiveType dtype) { StatusOr> PyTpuBuffer::FromLiterals( std::vector leaves, const Shape& tuple_shape, std::shared_ptr leaves_references, - std::shared_ptr client, std::shared_ptr device) { + std::shared_ptr client, std::shared_ptr device) { tensorflow::profiler::TraceMe traceme("PyTpuBuffer::FromLiterals"); VLOG(1) << "PyTpuBuffer::FromLiterals: shape: " << tuple_shape.DebugString() << " device: " << device->DebugString(); @@ -229,7 +229,7 @@ StatusOr> PyTpuBuffer::FromLiterals( /* static */ StatusOr> PyTpuBuffer::MakeTuple( absl::Span buffers, std::shared_ptr client, - std::shared_ptr device) { + std::shared_ptr device) { std::vector child_shapes; std::vector> child_device_buffers; std::vector child_handle_ptrs; @@ -388,7 +388,7 @@ PyTpuBuffer::DestructureTuple() { } StatusOr> PyTpuBuffer::CopyToDevice( - std::shared_ptr dst_device) { + std::shared_ptr dst_device) { tensorflow::profiler::TraceMe traceme("PyTpuBuffer::CopyToDevice"); if (on_host_shape_.IsTuple()) { return Unimplemented("CopyToDevice for tuples is not supported."); @@ -433,7 +433,7 @@ Status PyTpuBuffer::BlockHostUntilReady() { /* static */ StatusOr> PyTpuBuffer::AllocateBuffer( const Shape& shape, std::shared_ptr client, - std::shared_ptr device) { + std::shared_ptr device) { tensorflow::profiler::TraceMe traceme("PyTpuBuffer::AllocateBuffer"); VLOG(1) << "PyTpuBuffer::AllocateBuffer: shape: " << shape.DebugString() << " device: " << device->DebugString(); @@ -465,7 +465,7 @@ StatusOr> PyTpuBuffer::AllocateBuffer( 
/*static*/ StatusOr> PyTpuBuffer::CreateBuffer( const Shape& non_tuple_shape, absl::optional initializer, - std::shared_ptr client, std::shared_ptr device) { + std::shared_ptr client, std::shared_ptr device) { tensorflow::profiler::TraceMe traceme("PyTpuBuffer::CreateBuffer"); VLOG(1) << "PyTpuBuffer::CreateBuffer: shape: " << non_tuple_shape.DebugString() @@ -493,8 +493,8 @@ StatusOr> PyTpuBuffer::CreateBuffer( std::vector>(), client); } -static std::shared_ptr LookupDevice(const PyTpuClient& client, - int device_id) { +static std::shared_ptr LookupDevice(const PyTpuClient& client, + int device_id) { auto it = client.id_to_device().find(device_id); CHECK(it != client.id_to_device().end()) << "Unknown device id: " << device_id; @@ -516,7 +516,7 @@ PyTpuExecutable::PyTpuExecutable( for (int replica = 0; replica < num_replicas; ++replica) { for (int partition = 0; partition < num_partitions; ++partition) { int device_id = device_assignment_(replica, partition); - std::shared_ptr device = LookupDevice(*client_, device_id); + std::shared_ptr device = LookupDevice(*client_, device_id); if (device->host_id() != client_->host_id()) { VLOG(3) << "Non-local device: " << device_id; continue; @@ -541,7 +541,7 @@ PyTpuExecutable::ExecuteResult PyTpuExecutable::ExecuteHelper( absl::Span this_core_arguments, int replica, int partition, const RunId& run_id) { const int device_id = device_assignment_(replica, partition); - std::shared_ptr device = LookupDevice(*client_, device_id); + std::shared_ptr device = LookupDevice(*client_, device_id); CHECK_EQ(device->host_id(), client_->host_id()); tensorflow::profiler::TraceMe traceme("PyTpuExecutable::Execute"); VLOG(3) << "Replica " << replica << ", partition " << partition diff --git a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.h b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.h index 4c45df181db..c2a424677fd 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.h +++ b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.h @@ -38,7 +38,7 @@ namespace xla { constexpr char kTpuPlatform[] = "tpu"; -class TpuDevice : public Device { +class TpuDevice : public PjRtDevice { public: TpuDevice(int id, int host_id, const std::array& coords, int core_on_chip); @@ -48,8 +48,8 @@ class TpuDevice : public Device { std::string DebugString() const override; - static xla::StatusOr>> GetTpuDevices( - const tpu_driver::SystemInfo& system_info); + static xla::StatusOr>> + GetTpuDevices(const tpu_driver::SystemInfo& system_info); private: const std::array coords_; @@ -66,7 +66,7 @@ class PyTpuClient { explicit PyTpuClient(std::string platform_name, std::unique_ptr driver, - std::vector> devices, + std::vector> devices, int host_id); virtual ~PyTpuClient() = default; @@ -83,11 +83,11 @@ class PyTpuClient { int device_count() const { return devices_.size(); } int local_device_count() const { return local_devices_.size(); } - const std::vector>& devices() { return devices_; } - const std::vector>& local_devices() { + const std::vector>& devices() { return devices_; } + const std::vector>& local_devices() { return local_devices_; } - const std::map>& id_to_device() const { + const std::map>& id_to_device() const { return id_to_device_; } int host_id() const { return host_id_; } @@ -110,11 +110,11 @@ class PyTpuClient { std::unique_ptr driver_; // Includes all devices, including non-local devices on multi-host platforms. - std::vector> devices_; + std::vector> devices_; // Maps Device::id() to the corresponding Device. 
Includes all devices. - std::map> id_to_device_; + std::map> id_to_device_; // Local devices indexed by local device ordinal. - std::vector> local_devices_; + std::vector> local_devices_; int host_id_; // A thread pool for scheduling core executions in parallel. @@ -128,7 +128,7 @@ struct TpuSharedBuffer final { TpuSharedBuffer(tpu_driver::TpuDriver* driver, std::unique_ptr handle, std::vector> wait_for_use, - std::shared_ptr src_device) + std::shared_ptr src_device) : driver(driver), device(std::move(src_device)), handle(std::move(handle)), @@ -143,7 +143,7 @@ struct TpuSharedBuffer final { } tpu_driver::TpuDriver* const driver; - const std::shared_ptr device; + const std::shared_ptr device; std::unique_ptr handle; std::vector> wait_for_use; @@ -162,12 +162,12 @@ class PyTpuBuffer { static StatusOr> FromLiterals( std::vector leaves_literals, const Shape& tuple_shape, std::shared_ptr leaves_reference, - std::shared_ptr client, std::shared_ptr device); + std::shared_ptr client, std::shared_ptr device); // Supports nested tuple creation. static StatusOr> MakeTuple( absl::Span buffers, - std::shared_ptr client, std::shared_ptr device); + std::shared_ptr client, std::shared_ptr device); PyTpuBuffer() = delete; PyTpuBuffer(Shape on_host_shape, @@ -181,7 +181,7 @@ class PyTpuBuffer { PyTpuBuffer& operator=(PyTpuBuffer&&) = delete; const Shape& on_host_shape() const { return on_host_shape_; } - std::shared_ptr device() const { return device_; } + std::shared_ptr device() const { return device_; } const std::string& platform_name() const { return client_->platform_name(); } std::shared_ptr client() const { return client_; } @@ -210,7 +210,7 @@ class PyTpuBuffer { // Copies the buffer to target device `dst_device` and returns a PyTpuBuffer // object holding the context to the target device buffer. StatusOr> CopyToDevice( - std::shared_ptr dst_device); + std::shared_ptr dst_device); // Blocks the host until the buffer's value has been computed and is ready for // immediate use on the device. Useful in particular for timing benchmarks. @@ -220,7 +220,7 @@ class PyTpuBuffer { // tuple, the returned buffer corresponds to the root tuple buffer. static StatusOr> AllocateBuffer( const Shape& shape, std::shared_ptr client, - std::shared_ptr device); + std::shared_ptr device); private: // Initializes a just allocated device buffer. The returned event will be @@ -231,11 +231,11 @@ class PyTpuBuffer { static StatusOr> CreateBuffer( const Shape& non_tuple_shape, absl::optional initializer, - std::shared_ptr client, std::shared_ptr device); + std::shared_ptr client, std::shared_ptr device); const std::shared_ptr client_; const Shape on_host_shape_; - const std::shared_ptr device_; + const std::shared_ptr device_; // If this is a tuple, `device_buffer_` stores the tuple buffer and // `child_buffers_` stores the child buffers; else, `device_buffer_` stores @@ -302,7 +302,7 @@ class PyTpuExecutable { return local_logical_device_ids_; } - const std::vector>& local_devices() const { + const std::vector>& local_devices() const { return local_devices_; } @@ -350,7 +350,7 @@ class PyTpuExecutable { // assigned. // shared_ptrs instead of unique_ptrs to play well with the Python bindings // (see xla.cc). 
- std::vector> local_devices_; + std::vector> local_devices_; xla::Shape result_shape_; }; diff --git a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client_extension.cc b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client_extension.cc index 9a794b79c5c..5d526b51899 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client_extension.cc +++ b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client_extension.cc @@ -40,11 +40,12 @@ PYBIND11_MODULE(tpu_client_extension, m) { .def("host_id", &PyTpuClient::host_id) .def("get_default_device_assignment", [](PyTpuClient* client, int num_replicas, int num_partitions) - -> StatusOr>>> { + -> StatusOr< + std::vector>>> { TF_ASSIGN_OR_RETURN(DeviceAssignment device_assignment, client->GetDefaultDeviceAssignment( num_replicas, num_partitions)); - std::vector>> result; + std::vector>> result; result.resize(num_replicas); for (int r = 0; r < num_replicas; ++r) { result[r].resize(num_partitions); @@ -60,11 +61,11 @@ PYBIND11_MODULE(tpu_client_extension, m) { // TODO(skye): delete after all callers can handle 2D output .def("get_default_device_assignment", [](PyTpuClient* client, int num_replicas) - -> StatusOr>> { + -> StatusOr>> { TF_ASSIGN_OR_RETURN(DeviceAssignment device_assignment, client->GetDefaultDeviceAssignment( num_replicas, /*num_partitions=*/1)); - std::vector> result; + std::vector> result; for (int i = 0; i < num_replicas; ++i) { int device_id = device_assignment(i, 0); auto iter = client->id_to_device().find(device_id); @@ -96,7 +97,8 @@ PYBIND11_MODULE(tpu_client_extension, m) { .def( "buffer_from_pyval", [](std::shared_ptr client, - const pybind11::object& argument, std::shared_ptr device, + const pybind11::object& argument, + std::shared_ptr device, bool force_copy) -> StatusOr> { if (device == nullptr) { TF_RET_CHECK(!client->local_devices().empty()); @@ -145,7 +147,7 @@ PYBIND11_MODULE(tpu_client_extension, m) { py::class_(m, "PyTpuBuffer") .def_property_readonly("client", &PyTpuBuffer::client) .def("copy_to_device", - [](PyTpuBuffer* buffer, std::shared_ptr dst_device) { + [](PyTpuBuffer* buffer, std::shared_ptr dst_device) { CHECK(dst_device != nullptr); GlobalPyRefManager()->CollectGarbage(); py::gil_scoped_release gil_release; @@ -202,7 +204,7 @@ PYBIND11_MODULE(tpu_client_extension, m) { .def_property_readonly("traceback", [](PyTpuExecutable*) { return py::none(); }); - py::class_>(m, "TpuDevice") + py::class_>(m, "TpuDevice") .def_property_readonly("coords", &TpuDevice::coords) .def_property_readonly("core_on_chip", &TpuDevice::core_on_chip) .def("__repr__", [](const TpuDevice& device) { diff --git a/tensorflow/compiler/xla/python/xla.cc b/tensorflow/compiler/xla/python/xla.cc index e3bbc49f85c..d5977f4f0cf 100644 --- a/tensorflow/compiler/xla/python/xla.cc +++ b/tensorflow/compiler/xla/python/xla.cc @@ -44,6 +44,7 @@ limitations under the License. #include "tensorflow/compiler/xla/pjrt/pjrt_client.h" #include "tensorflow/compiler/xla/python/bfloat16.h" #include "tensorflow/compiler/xla/python/dlpack.h" +#include "tensorflow/compiler/xla/python/jax_jit.h" #include "tensorflow/compiler/xla/python/ops.h" #include "tensorflow/compiler/xla/python/outfeed_receiver_py.h" #include "tensorflow/compiler/xla/python/py_buffer.h" @@ -438,26 +439,26 @@ PYBIND11_MODULE(xla_extension, m) { device_assignment); }); - py::class_>( + py::class_>( m, "Device", "A descriptor of an available device.\n\nSubclasses are used to " "represent specific types of devices, e.g. CPUs, GPUs. 
Subclasses may " "have additional properties specific to that device type.") .def_property_readonly( - "id", &Device::id, + "id", &PjRtDevice::id, "Integer ID of this device.\n\nUnique across all available devices " "of this type, including remote devices on multi-host platforms.") - .def_property_readonly("host_id", &Device::host_id, + .def_property_readonly("host_id", &PjRtDevice::host_id, "Integer ID of this device's host.\n\n" "This is always 0 except on multi-host platforms.") - .def_property_readonly("platform", &Device::platform_name) - .def_property_readonly("device_kind", &Device::device_kind) + .def_property_readonly("platform", &PjRtDevice::platform_name) + .def_property_readonly("device_kind", &PjRtDevice::device_kind) .def_property_readonly( "client", - [](const ClientAndPtr& device) { return device.client; }) - .def("__str__", &Device::DebugString) + [](const ClientAndPtr& device) { return device.client; }) + .def("__str__", &PjRtDevice::DebugString) .def("transfer_to_infeed", - [](const Device& device, const LiteralSlice& literal) { + [](const PjRtDevice& device, const LiteralSlice& literal) { GlobalPyRefManager()->CollectGarbage(); py::gil_scoped_release gil_release; TF_ASSIGN_OR_RETURN(LocalDeviceState * local_device, @@ -467,7 +468,8 @@ PYBIND11_MODULE(xla_extension, m) { }) .def( "transfer_from_outfeed", - [](const Device& device, const Shape& shape) -> StatusOr { + [](const PjRtDevice& device, + const Shape& shape) -> StatusOr { GlobalPyRefManager()->CollectGarbage(); std::shared_ptr literal_shared; { @@ -491,12 +493,12 @@ PYBIND11_MODULE(xla_extension, m) { return LiteralToPython(std::move(literal_shared)); }); - py::class_>(m, "CpuDevice") + py::class_>(m, "CpuDevice") .def("__repr__", [](const CpuDevice& device) { return absl::StrFormat("CpuDevice(id=%i)", device.id()); }); - py::class_>(m, "GpuDevice") + py::class_>(m, "GpuDevice") .def("__repr__", [](const GpuDevice& device) { return absl::StrFormat("GpuDevice(id=%i)", device.id()); }); @@ -738,7 +740,7 @@ PYBIND11_MODULE(xla_extension, m) { .def(py::init([](const py::bytes& serialized_hlo_module_proto) -> std::unique_ptr { HloModuleProto proto; - proto.ParseFromString(serialized_hlo_module_proto); + proto.ParseFromString(std::string(serialized_hlo_module_proto)); return absl::make_unique(proto); })) .def("get_hlo_module", &GetHloModule) @@ -899,6 +901,7 @@ PYBIND11_MODULE(xla_extension, m) { BuildProfilerSubmodule(&m); BuildOutfeedReceiverSubmodule(&m); BuildPytreeSubmodule(m); + BuildJaxjitSubmodule(m); py::class_> diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index f5618b95c3e..dd16bd32dd1 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -1431,6 +1431,7 @@ cc_library( ":hlo_live_range", ":hlo_ordering", ":hlo_proto_cc", + ":memory_space_assignment_repacking", ":tuple_points_to_analysis", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:util", @@ -1842,6 +1843,7 @@ cc_library( ":hlo", ":hlo_creation_utils", ":hlo_pass", + ":op_expander_pass", ":while_util", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:statusor", @@ -2684,6 +2686,7 @@ cc_library( ":hlo_casting_utils", ":hlo_dce", ":hlo_pass", + ":shape_inference", "//tensorflow/compiler/xla:comparison_util", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", @@ -2707,7 +2710,6 @@ xla_test( ":dynamic_padder", ":hlo", ":hlo_dce", - ":hlo_get_dimension_size_rewriter", ":hlo_matchers", ":hlo_parser", 
"//tensorflow/compiler/xla:debug_options_flags", @@ -3435,6 +3437,26 @@ cc_library( ], ) +cc_library( + name = "memory_space_assignment_best_fit_repacker", + srcs = ["memory_space_assignment_best_fit_repacker.cc"], + hdrs = ["memory_space_assignment_best_fit_repacker.h"], + deps = [ + ":heap_simulator", + ":memory_space_assignment_repacking", + ], +) + +tf_cc_test( + name = "memory_space_assignment_best_fit_repacker_test", + srcs = ["memory_space_assignment_best_fit_repacker_test.cc"], + deps = [ + ":memory_space_assignment_best_fit_repacker", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:test", + ], +) + cc_library( name = "memory_space_assignment", srcs = ["memory_space_assignment.cc"], @@ -3997,42 +4019,6 @@ tf_cc_test( ], ) -cc_library( - name = "hlo_get_dimension_size_rewriter", - srcs = ["hlo_get_dimension_size_rewriter.cc"], - hdrs = ["hlo_get_dimension_size_rewriter.h"], - deps = [ - ":dynamic_dimension_inference", - ":hlo", - ":hlo_pass", - ":shape_inference", - "//tensorflow/compiler/xla:literal_util", - "@com_google_absl//absl/algorithm:container", - ], -) - -tf_cc_test( - name = "hlo_get_dimension_size_rewriter_test", - srcs = ["hlo_get_dimension_size_rewriter_test.cc"], - deps = [ - ":hlo", - ":hlo_get_dimension_size_rewriter", - ":hlo_matchers", - ":hlo_parser", - "//tensorflow/compiler/xla:literal", - "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:types", - "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla:xla_data_proto_cc", - "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:literal_test_util", - "//tensorflow/compiler/xla/tests:test_utils", - "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/core:lib", - "//tensorflow/core:test", - ], -) - cc_library( name = "maybe_owning_device_memory", srcs = [ diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index fa4d0e47a5d..214cbfa93a7 100755 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -2500,6 +2500,20 @@ Status AlgebraicSimplifierVisitor::HandleGather(HloInstruction* gather) { if (ShapeUtil::IsZeroElementArray(operand_shape)) { return ReplaceInstruction(gather, MakeScalarLike(gather, 0)); } + + // Gathering from a scalar operand is simply a broadcast of that scalar + if (ShapeUtil::IsEffectiveScalar(operand_shape)) { + HloInstruction* new_operand = gather->mutable_operand(0); + if (operand_shape.rank()) { + TF_ASSIGN_OR_RETURN(new_operand, + MakeReshapeHlo(ShapeUtil::MakeScalarShape( + operand_shape.element_type()), + new_operand)); + } + HloInstruction* new_gather = + MakeBroadcastHlo(new_operand, {}, gather->shape()); + return ReplaceInstruction(gather, new_gather); + } // If the operand of a gather is very small, it is easier to fuse a // sequence of selects. 
const Shape& index_shape = gather->operand(1)->shape(); @@ -2712,7 +2726,7 @@ Status AlgebraicSimplifierVisitor::HandleMultiply(HloInstruction* multiply) { // Mul(Mul(x, constant1), Mul(y, constant2)) => Mul(Mul(x, y), // constant1*constant2) if (Match(multiply, - m::Multiply( + m::MultiplyAnyOrder( m::MultiplyAnyOrder(m::NonConstant(&a), m::Constant(&c1)), m::MultiplyAnyOrder(m::NonConstant(&b), m::Constant(&c2))))) { TF_ASSIGN_OR_RETURN(auto* product_of_constants, @@ -2734,6 +2748,29 @@ Status AlgebraicSimplifierVisitor::HandleMultiply(HloInstruction* multiply) { } } + { + HloInstruction *a, *c1, *c2; + // Mul(Mul(a, constant1), constant2) => Mul(a, constant1*constant2) + if (Match(multiply, + m::MultiplyAnyOrder( + m::MultiplyAnyOrder(m::NonConstant(&a), m::Constant(&c1)), + m::Constant(&c2)))) { + TF_ASSIGN_OR_RETURN(auto* product_of_constants, + MakeBinaryHlo(HloOpcode::kMultiply, c1, c2)); + if (ShapeUtil::IsScalar(product_of_constants->shape()) && + !ShapeUtil::IsScalar(multiply->shape())) { + product_of_constants = + computation_->AddInstruction(HloInstruction::CreateBroadcast( + multiply->shape(), product_of_constants, {})); + } + + return ReplaceWithNewInstruction( + multiply, + HloInstruction::CreateBinary(multiply->shape(), HloOpcode::kMultiply, + a, product_of_constants)); + } + } + { HloInstruction *a, *b, *constant, *op; // Mul(Mul(a, constant1), Broadcast(b)) => diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc index 95700b2a994..70147f6ecad 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc @@ -140,6 +140,26 @@ TEST_F(AlgebraicSimplifierTest, MultiplyChain) { m::MultiplyAnyOrder(m::ConstantScalar(2), m::ConstantScalar(4))))); } +// (a*C1)*C2 => a*(C1*C2) +TEST_F(AlgebraicSimplifierTest, MultiplyChain2) { + const char* kModuleStr = R"( + HloModule m + test { + p0 = f32[] parameter(0) + a = f32[] constant(2) + b = f32[] constant(4) + c = f32[] multiply(p0, a) + ROOT y = f32[] multiply(c, b) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT(m->entry_computation()->root_instruction(), + GmockMatch(m::MultiplyAnyOrder( + m::Parameter(0), m::MultiplyAnyOrder(m::ConstantScalar(2), + m::ConstantScalar(4))))); +} + // MUL(MUL(X, BROADCAST(constant)), BROADCAST(Y)) ==> // MUL(X, BROADCAST(MUL(Y, BROADCAST(constant)))) TEST_F(AlgebraicSimplifierTest, MultiplyBroadcastReassoc) { @@ -5627,6 +5647,30 @@ INSTANTIATE_TEST_SUITE_P( DotOfGatherSimplificationTestInstantiation, DotOfGatherSimplificationTest, ::testing::ValuesIn(DotOfGatherPositiveNegativeTests())); +TEST_F(AlgebraicSimplifierTest, GatherOfScalarToBroadcast) { + const char* hlo_string = R"( + HloModule repeat + + ENTRY main { + o = f32[1,1] parameter(0) + i = s32[100,2] parameter(1) + ROOT g = f32[100] gather(o, i), collapsed_slice_dims={0,1}, + start_index_map={0,1}, + index_vector_dim=1, + offset_dims={}, + slice_sizes={1,1} + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + AlgebraicSimplifierOptions options; + AlgebraicSimplifier simplifier(options); + EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, GmockMatch(m::Broadcast(m::Reshape(m::Parameter(0))))); +} + 
TEST_F(AlgebraicSimplifierTest, TupleReduceReshape) { const char* hlo_string = R"( HloModule module diff --git a/tensorflow/compiler/xla/service/all_reduce_combiner.cc b/tensorflow/compiler/xla/service/all_reduce_combiner.cc index 9d8f03c92ca..5fb4935a4b1 100644 --- a/tensorflow/compiler/xla/service/all_reduce_combiner.cc +++ b/tensorflow/compiler/xla/service/all_reduce_combiner.cc @@ -268,6 +268,11 @@ StatusOr AllReduceCombiner::Run(HloModule* module) { VLOG(1) << "Running AllReduceCombiner with threshold of " << combine_threshold_in_bytes_ << " bytes"; + if (combine_threshold_in_bytes_ <= 0 || combine_threshold_count_ <= 0) { + VLOG(1) << "Skip AllReduceCombiner because the threshold is zero"; + return false; + } + if (hlo_query::ContainsLayoutConstrainedAllReduce(*module)) { VLOG(1) << "Skip AllReduceCombiner because the module contains all-reduce " "with constrained layouts"; diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc index 6cd58b86f0c..a0989d5765e 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment.cc @@ -1424,13 +1424,16 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering( // Returns a heap algorithm that chooses the best result from several // algorithms. auto get_heap_algorithm = [&](int64 alignment) { - auto algorithms = - absl::make_unique>>(); - algorithms->push_back(absl::make_unique( - alignment, GlobalDecreasingSizeBestFitHeap::kSpatial)); - algorithms->push_back(absl::make_unique( - alignment, GlobalDecreasingSizeBestFitHeap::kTemporal)); - return absl::make_unique(std::move(algorithms)); + auto algorithms = absl::make_unique< + std::vector>>>(); + algorithms->push_back( + absl::make_unique>( + alignment, GlobalDecreasingSizeBestFitHeap::kSpatial)); + algorithms->push_back( + absl::make_unique>( + alignment, GlobalDecreasingSizeBestFitHeap::kTemporal)); + return absl::make_unique>( + std::move(algorithms)); }; if (run_whole_module_heap_simulation) { @@ -1461,7 +1464,7 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering( options.buffers_to_assign = &single_colored_set.second; TF_ASSIGN_OR_RETURN( - HeapSimulator::Result result, + HeapSimulator::Result result, HeapSimulator::Run( get_heap_algorithm(alignment), assignment->module(), schedule, assignment->alias_analysis(), assignment->buffer_size_, options)); @@ -1487,7 +1490,7 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering( HeapSimulator::Options options; options.buffers_to_assign = &single_colored_set.second; TF_ASSIGN_OR_RETURN( - HeapSimulator::Result result, + HeapSimulator::Result result, HeapSimulator::Run(get_heap_algorithm(alignment), *computation, *instruction_sequence, assignment->alias_analysis(), @@ -1582,7 +1585,7 @@ std::vector ComputePeakMemoryLogicalBuffers( } // namespace void BufferAssigner::AssignBuffersFromHeapSimulator( - const HeapSimulator::Result& result, BufferAssignment* assignment, + const HeapSimulator::Result& result, BufferAssignment* assignment, BufferValue::Color color) { if (assignment->stats_.preallocated_temp_fragmentation_bytes == -1) { assignment->stats_.preallocated_temp_fragmentation_bytes = diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h index 50a4750601b..60422965832 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.h +++ b/tensorflow/compiler/xla/service/buffer_assignment.h @@ -661,9 +661,9 @@ class BufferAssigner { // Uses 
the results of the heap simulator to create a single allocation, with // LogicalBuffers packed to specific offsets. - void AssignBuffersFromHeapSimulator(const HeapSimulator::Result& result, - BufferAssignment* assignment, - LogicalBuffer::Color color); + void AssignBuffersFromHeapSimulator( + const HeapSimulator::Result& result, + BufferAssignment* assignment, LogicalBuffer::Color color); // Tries to assign the given instruction to the given buffer. Returns if the // assignment was successful. diff --git a/tensorflow/compiler/xla/service/conditional_code_motion.cc b/tensorflow/compiler/xla/service/conditional_code_motion.cc index cdda0aeb925..ce80b4cfc15 100644 --- a/tensorflow/compiler/xla/service/conditional_code_motion.cc +++ b/tensorflow/compiler/xla/service/conditional_code_motion.cc @@ -100,7 +100,7 @@ class BoundaryVisitor { // of reuses This is used as a placeholder only, assuming all // instructions can be fused to enable data reuses int64 ReusesCarriedBy(HloInstruction* op, HloInstruction* user) { - VLOG(1) << "ConditionalCodeMotion: Add reuses carried by instr: " + VLOG(2) << "ConditionalCodeMotion: Add reuses carried by instr: " << op->ToString() << "=>" << user->ToString() << "\n"; switch (user->opcode()) { case HloOpcode::kGetTupleElement: @@ -432,7 +432,8 @@ StatusOr ConditionalCodeMotion::MoveInstructionOut( if (to_move_out.empty()) { return false; } - VLOG(1) << "number of boundaries to move out:" << to_move_out.size() << "\n"; + VLOG(1) << "Modifying code--number of boundaries to move out:" + << to_move_out.size() << "\n"; HloComputation* conditional_parent = conditional->parent(); // save the old users before add new conditional user instructions std::vector old_conditional_users = conditional->users(); @@ -441,7 +442,7 @@ StatusOr ConditionalCodeMotion::MoveInstructionOut( absl::flat_hash_map hoisted_instructions; // Insert GetTupleElement before the instructions whose operands might still // be within the conditional. - VLOG(2) << "before opt:" + VLOG(1) << "before opt:" << conditional_parent->ToString(HloPrintOptions::Fingerprint()) << "\n"; int64 op_index = 0; @@ -470,16 +471,22 @@ StatusOr ConditionalCodeMotion::MoveInstructionOut( HloInstruction* old_root = conditional->branch_computation(0)->root_instruction(); for (auto user_instr : old_conditional_users) { + VLOG(2) << "Checking conditional user: " << user_instr->ToString() << "\n"; CHECK(user_instr->opcode() == HloOpcode::kGetTupleElement); auto tuple_opd = static_cast(user_instr); int64 index = tuple_opd->tuple_index(); + CHECK(old_root->operands().size() > index); HloInstruction* old_opd = old_root->operands()[index]; + CHECK(ContainsKey(hoisted_instructions, old_opd)); HloInstruction* new_opd = hoisted_instructions[old_opd].operands()[0]; CHECK(old_opd != nullptr); CHECK(new_opd != nullptr); + VLOG(2) << "Try replace all uses of :" << old_opd->ToString() << "\n"; TF_RETURN_IF_ERROR(user_instr->ReplaceAllUsesWith(new_opd)); TF_RETURN_IF_ERROR(conditional_parent->RemoveInstruction(user_instr)); } + VLOG(2) << "Done changing conditional users\n" + << conditional_parent->ToString() << "\n"; // Create tuple element within each branch and set it as root. 
int64 branch_count = conditional->branch_count(); for (int i = 0; i < branch_count; i++) { @@ -487,9 +494,8 @@ StatusOr ConditionalCodeMotion::MoveInstructionOut( std::vector elements; for (auto b1 : new_boundaries) { HloInstruction* op = b1.operands()[i]; - VLOG(1) << "branch count=" << i << "\n"; CHECK(op != nullptr); - VLOG(1) << "Adding to root " << i << " with " << op->ToString() << "\n"; + VLOG(2) << "Adding to root " << i << " with " << op->ToString() << "\n"; elements.push_back(op); } HloInstruction* tuple = @@ -507,7 +513,7 @@ StatusOr ConditionalCodeMotion::MoveInstructionOut( conditional->branch_computation(0)->root_instruction(); *conditional->mutable_shape() = new_root->shape(); // - VLOG(2) << "done moving instructions out of branches\n" + VLOG(1) << "done moving instructions out of branches\n" << conditional_parent->ToString(HloPrintOptions::Fingerprint()) << "\n"; return true; @@ -520,48 +526,79 @@ StatusOr ConditionalCodeMotion::MoveInstructionIn( if (to_move_in.empty()) { return false; } - VLOG(1) << "number of boundaries to move in:" << to_move_in.size() << "\n"; - HloComputation* conditional_parent = conditional->parent(); - VLOG(2) << "before opt:" - << conditional_parent->ToString(HloPrintOptions::Fingerprint()) + VLOG(1) << "Modifying code---number of boundaries to move in:" + << to_move_in.size() << "\n"; + VLOG(1) << "before opt:" + << conditional->parent()->ToString(HloPrintOptions::Fingerprint()) << "\n"; // Mapping instructions to be moved to their new representations. absl::flat_hash_map hoisted_instructions; int64 to_move_in_size = to_move_in.size(); int64 branch_count = conditional->branch_count(); - int64 op_index = conditional->shape().tuple_shapes_size(); - // Map conditional to its old root, then create a new root instruction in each - // branch. - Boundary b(Boundary::Position::kInsideBranch); + // Number of old conditional entries still to be used outside. + // If conditional shape is not tuple, will create a tuple and use subscript + // 0 to save the old operand being used. + int64 op_index = conditional->shape().IsTuple() + ? conditional->shape().tuple_shapes_size() - 1 + : 0; + HloGetTupleElementInstruction* tuple_use = + dynamic_cast(to_move_in[0].operands()[0]); + int64 use_index = (tuple_use != nullptr) ? tuple_use->tuple_index() : -1; + VLOG(2) << "Tuple use index = " << use_index << "\n"; + // Use to map the tuple_use instruction to its operand; + Boundary b_opd_use(Boundary::Position::kInsideBranch); + Boundary b_old_root(Boundary::Position::kInsideBranch); + // Create a new root instruction in each branch. for (int i = 0; i < branch_count; i++) { auto computation = conditional->branch_computation(i); auto old_root = computation->root_instruction(); - b.mutable_operands().push_back(old_root); - HloInstruction* new_root = nullptr; + b_old_root.mutable_operands().push_back(old_root); + std::vector operands; if (old_root->opcode() == HloOpcode::kTuple) { - new_root = computation->AddInstruction(old_root->Clone()); - } else { - std::vector operands; - if (!old_root->shape().IsTuple()) { - operands.push_back(old_root); - } else { - const Shape& old_shape = old_root->shape(); - for (int64 i = 0; i < old_shape.tuple_shapes_size(); ++i) { - auto element = - computation->AddInstruction(HloInstruction::CreateGetTupleElement( - old_shape.tuple_shapes(i), old_root, i)); - operands.push_back(element); + // Use operands of old_root directly, so old_root can be removed later. 
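+      // Illustrative example: with old_root = tuple(a, b, c) and +      // use_index == 1, `operands` collects {a, c} for the new root while b +      // is recorded in `b_opd_use`, so instructions moved into the branch +      // read b directly instead of going through the conditional.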
+ for (int i = 0; i < old_root->operand_count(); ++i) { + if (i != use_index) { + operands.push_back(old_root->operands()[i]); + } else { // Map conditional use to the tuple operand. + b_opd_use.mutable_operands().push_back(old_root->operands()[i]); } } - new_root = - computation->AddInstruction(HloInstruction::CreateTuple(operands)); + } else if (old_root->shape().IsTuple()) { + // If old_root is not a kTuple but has tuple shape, elements within the + // tuple must be extracted first to be used by the new instructions. + const Shape& old_shape = old_root->shape(); + for (int64 i = 0; i < old_shape.tuple_shapes_size(); ++i) { + auto element = + computation->AddInstruction(HloInstruction::CreateGetTupleElement( + old_shape.tuple_shapes(i), old_root, i)); + if (i != use_index) { + operands.push_back(element); + } else { + b_opd_use.mutable_operands().push_back(element); + } + } + } else { + // If old_root is not a tuple and does not have tuple shape, use it + // to replace the conditional directly in the new computation. + b_opd_use.mutable_operands().push_back(conditional); } + HloInstruction* new_root = + computation->AddInstruction(HloInstruction::CreateTuple(operands)); VLOG(2) << "setting new root: " << new_root->ToString() << "\n"; - computation->set_root_instruction(new_root); + computation->set_root_instruction(new_root, + /*accept_different_shape*/ true); + if (old_root->opcode() == HloOpcode::kTuple) { + TF_RETURN_IF_ERROR(computation->RemoveInstruction(old_root)); + } VLOG(2) << "new branch computation: " << computation->ToString() << "\n"; } - hoisted_instructions[conditional] = b; - for (int64 i = 0; i < to_move_in_size; i++) { + hoisted_instructions[conditional] = b_old_root; + int64 cp_start = 0; + if (use_index >= 0) { + hoisted_instructions[tuple_use] = b_opd_use; + cp_start = 1; + } + for (int64 i = cp_start; i < to_move_in_size; i++) { Boundary b_to_move = to_move_in[i]; HloInstruction* op = b_to_move.operands()[0]; CHECK(op != nullptr); @@ -591,12 +628,12 @@ StatusOr ConditionalCodeMotion::MoveInstructionIn( } if (to_be_used_outside) { // Modify uses of instructions outside of the conditionals - HloInstruction* gtr = conditional_parent->AddInstruction( + HloInstruction* gtr = conditional->parent()->AddInstruction( HloInstruction::CreateGetTupleElement(op->shape(), conditional, op_index++)); TF_RETURN_IF_ERROR(op->ReplaceAllUsesWith(gtr)); - if (conditional_parent->root_instruction() == op) { - conditional_parent->set_root_instruction(gtr); + if (conditional->parent()->root_instruction() == op) { + conditional->parent()->set_root_instruction(gtr); } } } @@ -606,8 +643,8 @@ StatusOr ConditionalCodeMotion::MoveInstructionIn( HloInstruction* new_root = conditional->branch_computation(0)->root_instruction(); *conditional->mutable_shape() = new_root->shape(); - VLOG(2) << "Before removing instructions:" << conditional_parent->ToString() - << "\n"; + VLOG(2) << "Before removing instructions:" + << conditional->parent()->ToString() << "\n"; // Remove hoisted instructions from the branches. 
for (int64 i = to_move_in_size - 1; i >= 0; i--) { Boundary boundary_to_move_in = to_move_in[i]; @@ -616,10 +653,10 @@ StatusOr ConditionalCodeMotion::MoveInstructionIn( for (auto user : op->users()) { VLOG(2) << "Has User: " << user->ToString() << "\n"; } - TF_RETURN_IF_ERROR(conditional_parent->RemoveInstruction(op)); + TF_RETURN_IF_ERROR(conditional->parent()->RemoveInstruction(op)); } - VLOG(2) << "Done moving instructions inside branches\n" - << conditional_parent->ToString(HloPrintOptions::Fingerprint()) + VLOG(1) << "Done moving instructions inside branches\n" + << conditional->parent()->ToString(HloPrintOptions::Fingerprint()) << "\n"; return true; } @@ -631,6 +668,7 @@ class GroupConnectedBoundaries { HloInstruction* conditional_; HloComputation* conditional_parent_; bool is_layout_sensitive_; + // Instructions that have been visited but are not going to be moved. absl::flat_hash_set visited_; public: @@ -663,7 +701,7 @@ class GroupConnectedBoundaries { case HloOpcode::kReshape: return true; default: - VLOG(1) << "Instruction is convert and its operand is not know to " + VLOG(2) << "Instruction is convert and its operand is not know to " "be worth hoisting\n"; return false; } @@ -680,24 +718,28 @@ class GroupConnectedBoundaries { case HloOpcode::kGetTupleElement: return true; default: - VLOG(1) << "Instruction is not known to be worth hoisting\n"; + VLOG(2) << "Instruction is not known to be worth hoisting\n"; return false; } } int64 ReusesBeforeBoundary(HloInstruction* user) { int64 reuses = 0; for (auto op : user->operands()) { + // The operand must be an instruction that is not going to be moved (if + // user is inside the conditional); otherwise it must be the conditional + // itself and its user must be outside of the conditional. + if (!ContainsKey(visited_, op) && op != conditional_) { + continue; + } // Only consider single-user cases as reuseable. - if (ContainsKey(visited_, op) && op->user_count() == 1) { + if (user->opcode() == HloOpcode::kGetTupleElement && + user->user_count() == 1) { + reuses += ReusesCarriedBy(op, user->users()[0]); + } else if (op->user_count() == 1) { reuses += ReusesCarriedBy(op, user); - } else if (op->opcode() == HloOpcode::kConditional && - user->opcode() == HloOpcode::kGetTupleElement) { - if (user->user_count() == 1) { - reuses += ReusesCarriedBy(op, user->users()[0]); - } } } - VLOG(1) << "Reuses before instruction " << user->ToString() << ":" << reuses + VLOG(2) << "Reuses before instruction " << user->ToString() << ":" << reuses << "\n"; return reuses; } @@ -735,7 +777,7 @@ class GroupConnectedBoundaries { } else if (ContainsKey(visited_, op)) { reuses += ReusesCarriedBy(user, op); } - VLOG(1) << "reuses after instruction " << user->ToString() << ":" + VLOG(2) << "reuses after instruction " << user->ToString() << ":" << reuses << "\n"; return reuses; } @@ -744,7 +786,8 @@ class GroupConnectedBoundaries { int64 BenefitForMovingBoundaries(const std::vector& boundaries) { int64 reuses_before = 0, reuses_after = 0; - if (boundaries.size() == 1 && boundaries[0].IsOutsideBranch()) { + if (boundaries.size() == 1 && boundaries[0].IsOutsideBranch() && + boundaries[0].operands()[0]->opcode() == HloOpcode::kGetTupleElement) { // The only boundary of moving-in is the get_tuple_element op. 
return -1; } @@ -754,16 +797,16 @@ class GroupConnectedBoundaries { continue; } reuses_before += ReusesBeforeBoundary(op); - VLOG(1) << "Reuses before boundary so far: " << reuses_before << "\n"; + VLOG(2) << "Reuses before boundary so far: " << reuses_before << "\n"; reuses_after += ReusesAfterBoundary(op); - VLOG(1) << "Reuese after boundary so far : " << reuses_after << "\n"; + VLOG(2) << "Reuese after boundary so far : " << reuses_after << "\n"; } if (reuses_after == 0 && reuses_before == 0) { return -1; } else if (boundaries[0].IsInsideBranch()) { return reuses_after - reuses_before; } else { - return reuses_before - reuses_after; + return reuses_before - reuses_after - 1; } } @@ -800,12 +843,12 @@ class GroupConnectedBoundaries { visitor.AddToWorkList(boundary); while (visitor.HasNextBoundary()) { Boundary b = visitor.PopNextBoundary(); - VLOG(1) << "visiting boundary " << b.ToString() << "\n"; + VLOG(2) << "visiting boundary " << b.ToString() << "\n"; if ((b.IsOutsideBranch() || InstructionWithinBranchIdentical( b.operands(), is_layout_sensitive_)) && WorthHoisting(b.operands()[0])) { connected_boundaries_.push_back(b); - VLOG(1) << "boundary can be moved\n"; + VLOG(2) << "boundary can be moved\n"; int64 operand_count = (b.IsInsideBranch()) ? b.operands()[0]->operand_count() : b.operands()[0]->users().size(); @@ -829,7 +872,7 @@ class GroupConnectedBoundaries { } } } else { - VLOG(1) << "boundary cannot be moved\n"; + VLOG(2) << "boundary cannot be moved\n"; visited_.insert(b.operands()[0]); new_boundaries_.push_back(b); } @@ -876,7 +919,7 @@ ConditionalCodeMotion::Decision ConditionalCodeMotion::ConsiderCodeMotion( auto move_in_or_out = connect.BoundariesToMoveInOrOut(cur_boundary); if (!move_in_or_out.empty()) { auto benefit = connect.BenefitForMovingBoundaries(move_in_or_out); - VLOG(1) << "benefit of moving in or out " + VLOG(2) << "benefit of moving in or out " << cur_boundary.operands()[0]->ToString() << ":" << benefit << "\n"; if (benefit >= 0) { new_boundaries.clear(); @@ -899,9 +942,20 @@ StatusOr ConditionalCodeMotion::Run(HloModule* module) { // Gather all the conditional ops in the module ahead of time, to avoid // potential complications of modifying the code that affecting traversal. std::vector conditional_ops; + // Track how many times each branch computation is shared. + absl::flat_hash_map conditional_computations; for (auto* comp : module->MakeComputationPostOrder()) { for (auto* instr : comp->MakeInstructionPostOrder()) { if (instr->opcode() == HloOpcode::kConditional) { + int branch_count = instr->branch_count(); + for (int i = 0; i < branch_count; ++i) { + HloComputation* branch_i = instr->branch_computation(i); + if (ContainsKey(conditional_computations, branch_i)) { + conditional_computations[branch_i]++; + } else { + conditional_computations[branch_i] = 0; + } + } conditional_ops.push_back(instr); } } @@ -909,6 +963,17 @@ StatusOr ConditionalCodeMotion::Run(HloModule* module) { bool changed = false; for (HloInstruction* conditional : conditional_ops) { + int branch_count = conditional->branch_count(); + // check for shared conditional computations + bool conditional_is_shared = false; + for (int i = 0; i < branch_count; ++i) { + HloComputation* branch_i = conditional->branch_computation(i); + if (conditional_computations[branch_i] > 0) { + conditional_is_shared = true; + break; + } + } + // Boundaries to move out or to move into the branches. 
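The scoring in BenefitForMovingBoundaries above reduces to a small amount of arithmetic; the standalone sketch below (hypothetical reuse counts, not the XLA data structures) restates the decision rule, including the extra -1 bias this change introduces against moving boundaries into branches. Callers treat a non-negative score as "worth moving".

#include <cstdint>

// `inside_branch` mirrors boundaries[0].IsInsideBranch().
int64_t BenefitSketch(int64_t reuses_before, int64_t reuses_after,
                      bool inside_branch) {
  if (reuses_before == 0 && reuses_after == 0) {
    return -1;  // nothing is reused either way; do not move
  }
  if (inside_branch) {
    return reuses_after - reuses_before;  // candidate for moving out of branch
  }
  return reuses_before - reuses_after - 1;  // moving in, biased down by one
}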
std::vector to_move_out, to_move_in, new_boundaries; // The conditional is moved into a worklist as the seed (starting point). @@ -926,6 +991,33 @@ StatusOr ConditionalCodeMotion::Run(HloModule* module) { Boundary boundary = visitor.PopNextBoundary(); VLOG(2) << "Analyzing boundary:" << boundary.ToString() << "\n"; d = ConsiderCodeMotion(conditional, boundary, to_move, next_boundary); + if (d != Decision::kNoChange && conditional_is_shared) { + for (int i = 0; i < branch_count; ++i) { + HloComputation* branch_i = conditional->branch_computation(i); + if (conditional_computations[branch_i] > 0) { + // Cloning is absolutely needed if the computation is shared by + // different branches, but the cloning can be potentially avoided + // if the sharing is only among branches of the same conditional. + // If cloning these branches causes a problem due to space issues, + // a fix can pass a vector of unique branches to the actual + // transformations, as an alternative representation of the + // conditional branches to be modified. Right now we assume the + // overhead of cloning is minimal since later stages of the compiler + // inline all the computations anyway. + HloComputation* clone_i = + conditional->parent()->parent()->AddEmbeddedComputation( + branch_i->Clone()); + conditional->set_branch_computation(i, clone_i); + conditional_computations[branch_i]--; + } + } + to_move.clear(); + next_boundary.clear(); + VLOG(2) << "Cloned branches as needed: " << conditional->ToString() + << "\n"; + // Need to reanalyze the cloned code to generate correct result. + d = ConsiderCodeMotion(conditional, boundary, to_move, next_boundary); + } switch (d) { case Decision::kMoveOutOfBranch: VLOG(2) << "Decision is move out of branch\n"; @@ -961,22 +1053,14 @@ StatusOr ConditionalCodeMotion::Run(HloModule* module) { MoveInstructionIn(conditional, to_move_in, new_boundaries)); VLOG(2) << "moving in result:" << result << "\n"; changed |= result; - } - } - // handling convert rematerialization/hoisting - if (!changed && pursue_full_conditional_code_motion_) { - std::vector conditional_ops; - for (auto* comp : module->MakeComputationPostOrder()) { - for (auto* instr : comp->MakeInstructionPostOrder()) { - if (instr->opcode() == HloOpcode::kConditional) { - conditional_ops.push_back(instr); - } - } - } - for (HloInstruction* conditional_op : conditional_ops) { + } else if (pursue_full_conditional_code_motion_ && !conditional_is_shared) { + // Invoke special handling for convert rematerialization/hoisting + // We need to make sure no sharing is present in the branches because no + // cloning has been done by the earlier analysis. + // TOOD[b/165848866]: extend solution to handle cloning for special move. 
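The sharing bookkeeping added to Run() above can be summarized as a small reference count (sketch below with std containers and a stand-in Computation type): a branch computation whose count is still positive when a transformation is about to mutate it gets cloned first, and the count is decremented so later conditionals still see the remaining shares.

#include <unordered_map>
#include <vector>

struct Computation {};  // stand-in for HloComputation

// Count how many *extra* conditionals reuse each branch computation. The first
// user leaves the count at 0, matching the pass, so count > 0 means "shared".
std::unordered_map<const Computation*, int> CountBranchSharing(
    const std::vector<std::vector<const Computation*>>& branches_per_cond) {
  std::unordered_map<const Computation*, int> counts;
  for (const auto& branches : branches_per_cond) {
    for (const Computation* b : branches) {
      auto it = counts.find(b);
      if (it == counts.end()) {
        counts[b] = 0;   // first conditional to use this branch
      } else {
        ++it->second;    // every additional user marks it as shared
      }
    }
  }
  return counts;
}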
TF_ASSIGN_OR_RETURN( bool convert_result, - ConvertSpecialMove(conditional_op, is_layout_sensitive_)); + ConvertSpecialMove(conditional, is_layout_sensitive_)); changed |= convert_result; } } diff --git a/tensorflow/compiler/xla/service/conditional_code_motion_test.cc b/tensorflow/compiler/xla/service/conditional_code_motion_test.cc index b0a6ba92f48..b91f3813980 100644 --- a/tensorflow/compiler/xla/service/conditional_code_motion_test.cc +++ b/tensorflow/compiler/xla/service/conditional_code_motion_test.cc @@ -580,6 +580,154 @@ ENTRY main { HloInstruction* root = module->entry_computation()->root_instruction(); EXPECT_THAT(root, AllOf(op::GetTupleElement(op::Conditional()))); } + +TEST_F(ConditionalCodeMotionTest, MovePowInWithSharedBranch) { + absl::string_view hlo_string = + R"( +HloModule RemoveIdenticalInstruction + +branch { + arg_tuple.1 = (f32[10]) parameter(0) + get-tuple-element.1 = f32[10] get-tuple-element(arg_tuple.1), index=0 + add.1 = f32[10] add(get-tuple-element.1, get-tuple-element.1) + ROOT tuple.3 = (f32[10]) tuple(add.1) +} + +ENTRY main { + pred.1 = pred[] parameter(0) + tuple.1 = (f32[10]) parameter(1) + tuple.2 = (f32[10]) parameter(2) + conditional = (f32[10]) + conditional(pred.1, tuple.1, tuple.2), true_computation=branch, + false_computation=branch + get-first-index = f32[10] get-tuple-element(conditional), index=0 + ROOT pow.1 = f32[10] power(get-first-index, get-first-index) +} +)"; + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass(true, true); + ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); + const HloInstruction* conditional = + FindInstruction(module.get(), "conditional"); + const HloComputation* on_true = conditional->branch_computation(0); + ASSERT_EQ(on_true->instruction_count(), 5); + const HloComputation* on_false = conditional->branch_computation(1); + ASSERT_EQ(on_false->instruction_count(), 5); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::GetTupleElement(op::Conditional()))); +} + +TEST_F(ConditionalCodeMotionTest, MovePowInWithNonTupleRoot) { + absl::string_view hlo_string = + R"( +HloModule RemoveIdenticalInstruction + +branch { + arg_tuple.1 = (f32[10]) parameter(0) + get-tuple-element.1 = f32[10] get-tuple-element(arg_tuple.1), index=0 + ROOT add.1 = f32[10] add(get-tuple-element.1, get-tuple-element.1) +} + +ENTRY main { + pred.1 = pred[] parameter(0) + tuple.1 = (f32[10]) parameter(1) + tuple.2 = (f32[10]) parameter(2) + conditional = f32[10] + conditional(pred.1, tuple.1, tuple.2), true_computation=branch, + false_computation=branch + ROOT pow.1 = f32[10] power(conditional, conditional) +} +)"; + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass(true, true); + ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); + const HloInstruction* conditional = + FindInstruction(module.get(), "conditional"); + const HloComputation* on_true = conditional->branch_computation(0); + ASSERT_EQ(on_true->instruction_count(), 5); + const HloComputation* on_false = conditional->branch_computation(1); + ASSERT_EQ(on_false->instruction_count(), 5); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::GetTupleElement(op::Conditional()))); +} + +TEST_F(ConditionalCodeMotionTest, MovePowInWithEmptyBranch) { + absl::string_view hlo_string = + R"( +HloModule RemoveIdenticalInstruction + +branch1 { + arg_tuple.1 = (f32[10]) parameter(0) + get-tuple-element.1 = f32[10] 
get-tuple-element(arg_tuple.1), index=0 + add.1 = f32[10] add(get-tuple-element.1, get-tuple-element.1) + ROOT tuple.3 = (f32[10]) tuple(add.1) +} + +branch2 { + ROOT arg_tuple.1 = (f32[10]) parameter(0) +} + +ENTRY main { + pred.1 = pred[] parameter(0) + tuple.1 = (f32[10]) parameter(1) + tuple.2 = (f32[10]) parameter(2) + conditional = (f32[10]) + conditional(pred.1, tuple.1, tuple.2), true_computation=branch1, + false_computation=branch2 + get-first-index = f32[10] get-tuple-element(conditional), index=0 + ROOT pow.1 = f32[10] power(get-first-index, get-first-index) +} +)"; + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass(true, true); + ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); + const HloInstruction* conditional = + FindInstruction(module.get(), "conditional"); + const HloComputation* on_true = conditional->branch_computation(0); + ASSERT_EQ(on_true->instruction_count(), 5); + const HloComputation* on_false = conditional->branch_computation(1); + ASSERT_EQ(on_false->instruction_count(), 4); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::GetTupleElement(op::Conditional()))); +} + +TEST_F(ConditionalCodeMotionTest, MovePowInWithNonTupleParameter) { + absl::string_view hlo_string = + R"( +HloModule RemoveIdenticalInstruction + +branch { + arg.1 = f32[10] parameter(0) + ROOT add.1 = f32[10] add(arg.1, arg.1) +} + +ENTRY main { + pred.1 = pred[] parameter(0) + tuple.1 = f32[10] parameter(1) + tuple.2 = f32[10] parameter(2) + conditional = f32[10] + conditional(pred.1, tuple.1, tuple.2), true_computation=branch, + false_computation=branch + ROOT pow.1 = f32[10] power(conditional, conditional) +} +)"; + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass(true, true); + ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); + const HloInstruction* conditional = + FindInstruction(module.get(), "conditional"); + const HloComputation* on_true = conditional->branch_computation(0); + ASSERT_EQ(on_true->instruction_count(), 4); + const HloComputation* on_false = conditional->branch_computation(1); + ASSERT_EQ(on_false->instruction_count(), 4); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::GetTupleElement(op::Conditional()))); +} + } // namespace conditional_opt } // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index 7c362b2da44..b622b712f82 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -140,7 +140,6 @@ cc_library( "//tensorflow/compiler/xla/service:map_inliner", "//tensorflow/compiler/xla/service:rng_bit_generator_expander", "//tensorflow/compiler/xla/service:tree_reduction_rewriter", - "//tensorflow/compiler/xla/service:hlo_get_dimension_size_rewriter", "//tensorflow/compiler/xla/service:conditional_canonicalizer", "//tensorflow/compiler/xla/service:conditional_to_select", "//tensorflow/compiler/xla/service:slow_operation_alarm", diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index 39d2b11ad37..d8bf15ecdeb 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -85,7 +85,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_cse.h" #include "tensorflow/compiler/xla/service/hlo_dce.h" #include "tensorflow/compiler/xla/service/hlo_element_type_converter.h" -#include "tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_memory_scheduler.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" @@ -291,8 +290,7 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn( /*expansion_type=*/LogisticExpansionType::kExp); pipeline.AddPass(); pipeline.AddPass(); - pipeline.AddPass(); - pipeline.AddPass(); + pipeline.AddPass(ScatterExpander::kEliminateAllScatters); pipeline.AddPass(target_machine_features); { auto& pass = @@ -624,6 +622,7 @@ StatusOr> CpuCompiler::RunBackend( // Compile must be thread-safe so create a new LLVM context for the module. mlir::MLIRContext mlir_context; + mlir_context.loadAllGloballyRegisteredDialects(); llvm::LLVMContext llvm_context; auto llvm_module = absl::make_unique("__compute_module", llvm_context); @@ -835,6 +834,7 @@ CpuCompiler::CompileAheadOfTime(std::unique_ptr module_group, // Compile must be thread-safe so create a new LLVM context for the module. mlir::MLIRContext mlir_context; + mlir_context.loadAllGloballyRegisteredDialects(); llvm::LLVMContext llvm_context; llvm::Module llvm_module("__compute_module", llvm_context); llvm_module.setDataLayout(target_machine->createDataLayout()); diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index 242f3c6ceb7..36566d6c25f 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -1640,7 +1640,7 @@ IrEmitter::ShardedVectorType IrEmitter::CreateShardedVectorType( if (current_size_fragment >= vector_register_size_in_elements) { auto vector_type = llvm::VectorType::get( - element_ir_type, vector_register_size_in_elements); + element_ir_type, vector_register_size_in_elements, false); sharded_vector_type.insert( sharded_vector_type.end(), current_size_fragment / vector_register_size_in_elements, @@ -1656,7 +1656,7 @@ IrEmitter::ShardedVectorType IrEmitter::CreateShardedVectorType( // of two are all legal vector sizes (or at least can be lowered easily by // LLVM). sharded_vector_type.push_back( - llvm::VectorType::get(element_ir_type, current_size_fragment)); + llvm::VectorType::get(element_ir_type, current_size_fragment, false)); } return sharded_vector_type; } diff --git a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc index 8d9229c1223..3afdd9c163e 100644 --- a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc +++ b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc @@ -115,7 +115,7 @@ void RewriteCalls( // Upcast to vector type if input is a scalar. if (vector_width == 1) { - llvm::Type* v1_type = llvm::VectorType::get(input->getType(), 1); + llvm::Type* v1_type = llvm::VectorType::get(input->getType(), 1, false); input = b.CreateInsertElement(llvm::UndefValue::get(v1_type), input, uint64_t{0}); } @@ -264,8 +264,8 @@ llvm::Value* GenerateVF32Exp(llvm::IRBuilder<>* b, llvm::Value* input, z = vsl.Add(one, z); // Convert n' to an i32. This is safe because we clamped it above. 
- llvm::Value* n_i32 = - b->CreateFPToSI(n, llvm::VectorType::get(b->getInt32Ty(), vector_width)); + llvm::Value* n_i32 = b->CreateFPToSI( + n, llvm::VectorType::get(b->getInt32Ty(), vector_width, false)); auto splat_i32 = [&](int32 v) { return b->CreateVectorSplat(vector_width, b->getInt32(v)); @@ -329,7 +329,7 @@ llvm::Value* GenerateVF32Log(llvm::IRBuilder<>* b, llvm::Value* input, llvm::Value* vector_constant_23 = b->CreateVectorSplat(vector_width, b->getInt32(23)); llvm::Type* i32_vector_type = - llvm::VectorType::get(b->getInt32Ty(), vector_width); + llvm::VectorType::get(b->getInt32Ty(), vector_width, false); llvm::Value* emm0 = b->CreateLShr(b->CreateBitCast(tmp0, i32_vector_type), vector_constant_23); diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.cc b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc index 0d2eab9fd42..48aa32f6b8f 100644 --- a/tensorflow/compiler/xla/service/cpu/vector_support_library.cc +++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc @@ -33,7 +33,7 @@ VectorSupportLibrary::VectorSupportLibrary(PrimitiveType primitive_type, scalar_type_ = llvm_ir::PrimitiveTypeToIrType( primitive_type, b_->GetInsertBlock()->getModule()); scalar_pointer_type_ = llvm::PointerType::getUnqual(scalar_type_); - vector_type_ = llvm::VectorType::get(scalar_type_, vector_size); + vector_type_ = llvm::VectorType::get(scalar_type_, vector_size, false); vector_pointer_type_ = llvm::PointerType::getUnqual(vector_type_); } @@ -155,7 +155,7 @@ llvm::Type* VectorSupportLibrary::IntegerTypeForFloatSize(bool vector) { int64 float_size_bits = data_layout.getTypeSizeInBits(scalar_type()); llvm::Type* scalar_int_type = b()->getIntNTy(float_size_bits); if (vector) { - return llvm::VectorType::get(scalar_int_type, vector_size()); + return llvm::VectorType::get(scalar_int_type, vector_size(), false); } else { return scalar_int_type; } diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.h b/tensorflow/compiler/xla/service/cpu/vector_support_library.h index f1a0b0a4406..cbed232897f 100644 --- a/tensorflow/compiler/xla/service/cpu/vector_support_library.h +++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.h @@ -276,7 +276,7 @@ class VectorSupportLibrary { llvm::Constant* scalar_value = llvm::ConstantFP::get(type->getContext(), f); if (llvm::isa(type)) { return llvm::ConstantVector::getSplat( - llvm::ElementCount(vector_size(), /*Scalable=*/false), scalar_value); + llvm::ElementCount::getFixed(vector_size()), scalar_value); } return scalar_value; } diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h index b0def1a2dd8..60d832a940a 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h @@ -245,6 +245,7 @@ class DfsHloVisitorBase { virtual Status HandleBitcast(HloInstructionPtr hlo) = 0; virtual Status HandleBroadcast(HloInstructionPtr hlo) = 0; virtual Status HandleReshape(HloInstructionPtr hlo) = 0; + virtual Status HandleDynamicReshape(HloInstructionPtr hlo) = 0; virtual Status HandleTranspose(HloInstructionPtr hlo) = 0; virtual Status HandleParameter(HloInstructionPtr hlo) = 0; virtual Status HandleFusion(HloInstructionPtr hlo) = 0; diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h index b1d674fe467..3d1a9a3c894 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h +++ 
b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h @@ -198,6 +198,9 @@ class DfsHloVisitorWithDefaultBase Status HandlePad(HloInstructionPtr pad) override { return DefaultAction(pad); } + Status HandleDynamicReshape(HloInstructionPtr dynamic_reshape) override { + return DefaultAction(dynamic_reshape); + } Status HandleReshape(HloInstructionPtr reshape) override { return DefaultAction(reshape); } diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc index 36429d3d755..80f98775c01 100644 --- a/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc +++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc @@ -97,6 +97,8 @@ class DynamicDimensionInferenceVisitor : public DfsHloVisitorWithDefault { Status HandleTranspose(HloInstruction* hlo) override; + Status HandleDynamicReshape(HloInstruction* hlo) override; + Status HandleReshape(HloInstruction* hlo) override; Status HandleSort(HloInstruction* hlo) override; @@ -621,6 +623,18 @@ Status DynamicDimensionInferenceVisitor::HandleClamp(HloInstruction* hlo) { return PassThroughDynamicDimension(hlo); } +Status DynamicDimensionInferenceVisitor::HandleDynamicReshape( + HloInstruction* hlo) { + HloDynamicReshapeInstruction* dynamic_reshape = + Cast(hlo); + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + if (hlo->shape().is_dynamic_dimension(i)) { + parent_->SetDynamicSize(hlo, {}, i, dynamic_reshape->dim_sizes(i)); + } + } + return Status::OK(); +} + Status DynamicDimensionInferenceVisitor::HandleReshape(HloInstruction* hlo) { return ForEachOperandDynamicDimension( hlo, diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc b/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc index b5a17619edf..69f64c31a2f 100644 --- a/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc +++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc @@ -1248,5 +1248,34 @@ TEST_F(DynamicDimensionInferenceTest, InfersCustomOp) { EXPECT_TRUE(handler_called); } +TEST_F(DynamicDimensionInferenceTest, DynamicReshapeOp) { + auto builder = HloComputation::Builder(TestName()); + auto input = builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {9}), "data_input")); + auto six = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(6))); + // Creates an input of shape [<=9], dynamic size is 6. 
+ auto dynamic_input = + builder.AddInstruction(HloInstruction::CreateSetDimensionSize( + ShapeUtil::MakeShape(F32, {9}, {true}), input, six, 0)); + auto dynamic_size = builder.AddInstruction(HloInstruction::CreateParameter( + 1, ShapeUtil::MakeShape(S32, {}), "size_param")); + auto three = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(3))); + + // Reshape [<=9] into [3, <=3] + + auto dynamic_reshape = + builder.AddInstruction(HloInstruction::CreateDynamicReshape( + ShapeUtil::MakeShape(F32, {3, 3}, {false, true}), dynamic_input, + {three, dynamic_size})); + + module_->AddEntryComputation(builder.Build()); + + TF_ASSERT_OK(RunInference()); + EXPECT_EQ(inference_->GetDynamicSize(dynamic_reshape, {}, 0), nullptr); + EXPECT_EQ(inference_->GetDynamicSize(dynamic_reshape, {}, 1), dynamic_size); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/dynamic_padder.cc b/tensorflow/compiler/xla/service/dynamic_padder.cc index c1f9da599e8..9b4d24bbbe9 100644 --- a/tensorflow/compiler/xla/service/dynamic_padder.cc +++ b/tensorflow/compiler/xla/service/dynamic_padder.cc @@ -32,6 +32,8 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_dce.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/shape_inference.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/util.h" @@ -125,6 +127,58 @@ StatusOr ChooseIdentityValue(HloInstruction* inst, } } +StatusOr ReplaceGetSize( + HloInstruction* instr, + DynamicDimensionInference* dynamic_dimension_inference) { + if (instr->opcode() != HloOpcode::kGetDimensionSize) { + return false; + } + HloComputation* computation = instr->parent(); + + TF_ASSIGN_OR_RETURN(auto legal_shape, + ShapeInference::InferGetDimensionSizeShape( + instr->operand(0)->shape(), instr->dimension())); + TF_RET_CHECK(ShapeUtil::Equal(instr->shape(), legal_shape)) + << "instr->shape() " << instr->shape().ToString() << " , " + << "legal_shape " << legal_shape.ToString(); + TF_RET_CHECK(ShapeUtil::HasPrimitiveType(instr->shape(), S32)); + HloInstruction* operand = instr->mutable_operand(0); + int64 dim = instr->dimension(); + HloInstruction* dynamic_size = + dynamic_dimension_inference->GetDynamicSize(operand, {}, dim); + if (dynamic_size != nullptr) { + TF_RETURN_IF_ERROR(instr->ReplaceAllUsesWith(dynamic_size)); + // The dependency between a instruction and its dynamic dimensions is not + // modeled in the IR. As instr is being replaced by dynamic_size, also tell + // dynamic dimension inference that the instruction is being replaced. 
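The ReplaceGetSize helper introduced in this dynamic_padder.cc hunk folds get-dimension-size away: if dynamic dimension inference knows a runtime size for the queried dimension, the op is replaced with that value; otherwise the dimension is static and the op becomes an s32 constant of the static bound. A condensed sketch of that decision (toy types, not the XLA classes):

#include <cstdint>
#include <optional>
#include <string>

// `known_dynamic_size` stands in for DynamicDimensionInference::GetDynamicSize;
// the returned string names what the get-dimension-size op is replaced with.
std::string ReplaceGetSizeSketch(std::optional<std::string> known_dynamic_size,
                                 int64_t static_bound) {
  if (known_dynamic_size.has_value()) {
    return *known_dynamic_size;  // dynamic case: use the runtime size value
  }
  // Static case: materialize the dimension's bound as a constant.
  return "s32 constant(" + std::to_string(static_bound) + ")";
}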
+ dynamic_dimension_inference->ReplaceAllDynamicDimensionUsesWith( + instr, dynamic_size); + } else { + int32 size = instr->operand(0)->shape().dimensions(dim); + HloInstruction* new_instr = computation->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(size))); + TF_RETURN_IF_ERROR(instr->ReplaceAllUsesWith(new_instr)); + dynamic_dimension_inference->ReplaceAllDynamicDimensionUsesWith(instr, + new_instr); + } + return true; +} + +StatusOr ReplaceSetSize(HloInstruction* instr) { + if (instr->opcode() != HloOpcode::kSetDimensionSize) { + return false; + } + + TF_RET_CHECK(Shape::Equal().IgnoreDynamicDimension()( + instr->shape(), instr->operand(0)->shape())) + << "instr->shape() " << instr->shape().ToString() << " , " + << "instruction operand shape " << instr->operand(0)->shape(); + HloInstruction* operand = instr->mutable_operand(0); + + TF_RETURN_IF_ERROR(instr->ReplaceAllUsesWith(operand)); + return true; +} + bool ShouldSkipPadOnOperand(const HloInstruction* inst, int64 operand_num, int64 dimension) { if ((inst->opcode() == HloOpcode::kReduceWindow || @@ -1236,6 +1290,18 @@ StatusOr DynamicPadder::Run(HloModule* module) { changed, RewriteDynamicReshape(inst, &dynamic_dimension_inference)); continue; } + + if (inst->opcode() == HloOpcode::kDynamicReshape) { + TF_ASSIGN_OR_RETURN( + changed, RewriteDynamicReshape(inst, &dynamic_dimension_inference)); + auto* static_reshape = + computation->AddInstruction(HloInstruction::CreateReshape( + inst->shape(), inst->mutable_operand(0))); + TF_RETURN_IF_ERROR(inst->ReplaceAllUsesWith(static_reshape)); + TF_RETURN_IF_ERROR(dynamic_dimension_inference.ForwardDynamicSize( + inst, static_reshape, {})); + continue; + } for (int64 operand_num = 0; operand_num < inst->operand_count(); ++operand_num) { HloInstruction* original_operand = inst->mutable_operand(operand_num); @@ -1292,6 +1358,22 @@ StatusOr DynamicPadder::Run(HloModule* module) { /*require_dynamic_output=*/require_dynamic_output)); } + for (auto* computation : module->computations()) { + for (auto instruction : computation->MakeInstructionPostOrder()) { + TF_ASSIGN_OR_RETURN( + bool replaced_get_size, + ReplaceGetSize(instruction, &dynamic_dimension_inference)); + changed = changed || replaced_get_size; + } + } + + for (auto* computation : module->computations()) { + for (auto instruction : computation->MakeInstructionPostOrder()) { + TF_ASSIGN_OR_RETURN(bool replaced_set_size, ReplaceSetSize(instruction)); + changed = changed || replaced_set_size; + } + } + HloDCE dce; TF_ASSIGN_OR_RETURN(changed, dce.Run(module)); VLOG(2) << "Post DynamicPadder HLO:"; diff --git a/tensorflow/compiler/xla/service/dynamic_padder_test.cc b/tensorflow/compiler/xla/service/dynamic_padder_test.cc index e8f429d9db6..3855531a97b 100644 --- a/tensorflow/compiler/xla/service/dynamic_padder_test.cc +++ b/tensorflow/compiler/xla/service/dynamic_padder_test.cc @@ -20,7 +20,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_dce.h" -#include "tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/service/hlo_module.h" @@ -380,10 +379,15 @@ class ExecutionTest : public HloTestBase { Literal PadAndExecute(std::unique_ptr module, absl::Span arguments, bool slice_dynamic_output = true) { + if (!slice_dynamic_output) { + auto new_config = module->config(); + new_config.mutable_entry_computation_layout() + ->mutable_result_layout() + ->ClearDynamicShape(); + module->set_config(new_config); + } DynamicPadder padder(slice_dynamic_output); TF_CHECK_OK(padder.Run(module.get()).status()); - HloGetDimensionSizeRewriter rewriter; - TF_CHECK_OK(rewriter.Run(module.get()).status()); HloDCE dce; TF_CHECK_OK(dce.Run(module.get()).status()); return ExecuteAndTransfer(std::move(module), arguments); @@ -1179,6 +1183,84 @@ ENTRY main { EXPECT_EQ(result, expected); } +XLA_TEST_F(ExecutionTest, DynamicReshapeDoubleDynamicDimensions) { + const string hlo_text = R"( +HloModule TensorFlowScatterV1 + +ENTRY main { + param = s32[2, 3, 3] parameter(0) + size = s32[] constant(2) + param_padded_partial = s32[2, <=3, 3] set-dimension-size(param, size), + dimensions={1} + param_padded = s32[2, <=3, <=3] set-dimension-size(param_padded_partial, size), + dimensions={2} + result_size = s32[] constant(8) + ROOT reshaped = s32[<=18] dynamic-reshape(param_padded, result_size) +} +)"; + + // First dimension (1) is dynamic. Since dynamic size is 0, result is also 0. + Literal operand = LiteralUtil::CreateR3( + {{{0, 1, 2}, {3, 4, 5}, {6, 7, 8}}, {{0, 1, 2}, {3, 4, 5}, {6, 7, 8}}}); + auto module = GetHloModule(hlo_text); + + Literal result = PadAndExecute(std::move(module), {&operand}, false); + result.SetDynamicSize(0, 8); + // Padded data looks like this (P is padding which is ignored). + // [[0, 1, P] + // [3, 4, P] + // [P, P, P]] + // + // [[0, 1, P] + // [3, 4, P] + // [P, P, P]] + // + // Reshaping (with correct reshape rewriting) produces: + // [0, 1, 3, 4, 0, 1, 3, 4] + Literal expected = LiteralUtil::CreateR1({0, 1, 3, 4, 0, 1, 3, 4}); + + EXPECT_EQ(result, expected); +} + +XLA_TEST_F(ExecutionTest, DynamicReshapeOutputDoubleDynamicDimensions) { + const string hlo_text = R"( +HloModule TensorFlowScatterV1 + +ENTRY main { + param = s32[18] parameter(0) + eight = s32[] constant(8) + param_dynamic = s32[<=18] set-dimension-size(param, eight), dimensions={0} + two = s32[] constant(2) + // every dimension has dynamic size two. + ROOT reshaped = s32[2, <=3, <=3] dynamic-reshape(param_dynamic, two, two, two) +} +)"; + Literal operand = LiteralUtil::CreateR1( + {0, 1, 3, 4, 0, 1, 3, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}); + + auto module = GetHloModule(hlo_text); + + Literal result = PadAndExecute(std::move(module), {&operand}, false); + + result.SetDynamicSize(1, 2); + result.SetDynamicSize(2, 2); + // Padded operand is: + // [0, 1, 3, 4, 0, 1, 3, 4, P, P ....] 
+ // + // Reshaping it should produce: + // [[0, 1, P] + // [3, 4, P] + // [P, P, P]] + // + // [[0, 1, P] + // [3, 4, P] + // [P, P, P]] + Literal expected = + LiteralUtil::CreateR3({{{0, 1}, {3, 4}}, {{0, 1}, {3, 4}}}); + + EXPECT_EQ(result, expected); +} + XLA_TEST_F(ExecutionTest, SetGetDimensionSize) { const string hlo_text = R"( HloModule TensorFlowScatterV1 @@ -1371,5 +1453,70 @@ ENTRY main { EXPECT_EQ(result, expected); } +namespace op = xla::testing::opcode_matchers; + +class HloDimensionSizeLegalizerTest : public HloTestBase { + protected: + HloDimensionSizeLegalizerTest() {} +}; + +TEST_F(HloDimensionSizeLegalizerTest, Ok) { + auto module = ParseAndReturnVerifiedModule(R"( +HloModule _ +ENTRY gds { + p = s32[3,4] parameter(0) + size0 = s32[] get-dimension-size(p), dimensions={0} + size1 = s32[] get-dimension-size(p), dimensions={1} + ROOT mul = s32[] multiply(size0, size1) +})") + .ValueOrDie(); + DynamicPadder pass; + EXPECT_TRUE(pass.Run(module.get()).ValueOrDie()); + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::Multiply(op::Constant(), op::Constant())); +} + +TEST_F(HloDimensionSizeLegalizerTest, GetSetSetDimensionSizeRewriter) { + auto module = ParseAndReturnVerifiedModule(R"( +HloModule _ +ENTRY gds { + p = s32[3,4] parameter(0) + size0 = s32[] get-dimension-size(p), dimensions={0} + p_copy = s32[3,4] copy(p) + p_copy_dynamic = s32[<=3, 4] set-dimension-size(p_copy, size0), dimensions={0} + size1 = s32[] get-dimension-size(p_copy_dynamic), dimensions={0} + ROOT mul = s32[] multiply(size0, size1) +})") + .ValueOrDie(); + DynamicPadder pass; + EXPECT_TRUE(pass.Run(module.get()).ValueOrDie()); + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::Multiply(op::Constant(), op::Constant())); +} + +TEST_F(HloDimensionSizeLegalizerTest, IllegalType) { + auto module = ParseAndReturnUnverifiedModule(R"( +HloModule _ +ENTRY gds { + p = s32[3]{0} parameter(0) + ROOT gds = s64[] get-dimension-size(p), dimensions={0} +})") + .ValueOrDie(); + DynamicPadder pass; + EXPECT_FALSE(pass.Run(module.get()).ok()); +} + +TEST_F(HloDimensionSizeLegalizerTest, IllegalDimension) { + auto module = ParseAndReturnUnverifiedModule(R"( +HloModule _ +ENTRY gds { + p = f32[2,5] parameter(0) + ROOT gds = s32[] get-dimension-size(p), dimensions={2} +})") + .ValueOrDie(); + DynamicPadder pass; + EXPECT_FALSE(pass.Run(module.get()).ok()); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index 074fbd92b27..d1d0827981e 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -254,6 +254,11 @@ cc_library( ":target_util", ":thunk", ":thunk_emitter", + "//tensorflow/compiler/mlir/hlo:lhlo", + "//tensorflow/compiler/mlir/xla:hlo_utils", + "//tensorflow/compiler/mlir/xla:mhlo_to_lhlo_with_xla", + "//tensorflow/compiler/mlir/xla:mlir_hlo_to_hlo", + "//tensorflow/compiler/mlir/xla:type_to_shape", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", @@ -291,6 +296,8 @@ cc_library( "@com_google_absl//absl/types:span", "@llvm-project//llvm:Core", "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:StandardOps", ], ) @@ -1159,6 +1166,7 @@ cc_library( ":target_constants", ":tree_reduction_rewriter", ":variadic_op_splitter", + "//tensorflow/compiler/mlir/xla:mhlo_to_lhlo_with_xla", "//tensorflow/compiler/xla:protobuf_util", 
"//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", @@ -1186,7 +1194,6 @@ cc_library( "//tensorflow/compiler/xla/service:hlo_dataflow_analysis", "//tensorflow/compiler/xla/service:hlo_dce", "//tensorflow/compiler/xla/service:hlo_element_type_converter", - "//tensorflow/compiler/xla/service:hlo_get_dimension_size_rewriter", "//tensorflow/compiler/xla/service:hlo_pass", "//tensorflow/compiler/xla/service:hlo_pass_pipeline", "//tensorflow/compiler/xla/service:hlo_proto_util", @@ -1217,6 +1224,8 @@ cc_library( "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@llvm-project//llvm:Core", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", + "@llvm-project//mlir:IR", ], ) diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc index 60e4cb84b09..a499dc70e23 100644 --- a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc +++ b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc @@ -230,18 +230,15 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) { // This is done to avoid the duplication of expensive instructions, which // would occur if 'fusion' were merged into multiple users. // - // If 'fusion' has just one user, then an earlier fusion pass chose not to - // fuse this producer/consumer pair (likely because of expensive instruction - // re-use by the consumer), and so we honor that choice here as well. - // - // Moreover, if we are going to save a "lot" in memory bandwidth then we + // However, if we are going to save a "lot" in memory bandwidth then we // ignore how expensive the fusion instructions are. The heuristic used to // determine "a lot" is the following: merging must reduce memory traffic by a // factor of 0.3, and the amount of memory accessed must not be entirely // trivial (above 1K). This likely has room for improvement in the future. 
bool allow_expensive_ops = - merged_to_current_bytes_ratio < 0.3 && current_bytes_transferred > 1024; + fusion->user_count() == 1 || + (merged_to_current_bytes_ratio < 0.3 && current_bytes_transferred > 1024); if (!allow_expensive_ops && absl::c_any_of(fusion->fused_instructions(), diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc b/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc index 42891154c23..cc4894f4c00 100644 --- a/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc +++ b/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc @@ -398,6 +398,29 @@ TEST_F(FusionMergerTest, WillMergeExpensiveFusionsIfSavesMemory) { EXPECT_TRUE(FusionMerger().Run(module.get()).ValueOrDie()); } +TEST_F(FusionMergerTest, WillMergeExpensiveFusionsWithSingleConsumer) { + auto module = ParseAndReturnVerifiedModule(R"( + HloModule m + + %f_b (p: f32[1024,1024,1024]) -> f32[1024,1024,1024] { + %p = f32[1024,1024,1024] parameter(0) + ROOT %t = f32[1024,1024,1024] tanh(%p) + } + + %f_c (p: f32[1024,1024,1024]) -> f32[1024,1024,1024] { + %p = f32[1024,1024,1024] parameter(0) + ROOT %t = f32[1024,1024,1024] add(%p, %p) + } + + ENTRY entry { + p0 = f32[1024,1024,1024] parameter(0) + f1 = f32[1024,1024,1024] fusion(p0), kind=kLoop, calls=%f_b + ROOT f2 = f32[1024,1024,1024] fusion(f1), kind=kLoop, calls=%f_c + })") + .ValueOrDie(); + EXPECT_TRUE(FusionMerger().Run(module.get()).ValueOrDie()); +} + } // namespace } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc index f5bf7476059..77fcf2c59f7 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc @@ -29,6 +29,8 @@ limitations under the License. #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/Verifier.h" +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/InitAllDialects.h" // from @llvm-project #include "tensorflow/compiler/xla/protobuf_util.h" #include "tensorflow/compiler/xla/service/algebraic_simplifier.h" #include "tensorflow/compiler/xla/service/all_reduce_combiner.h" @@ -81,7 +83,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h" #include "tensorflow/compiler/xla/service/hlo_dce.h" #include "tensorflow/compiler/xla/service/hlo_element_type_converter.h" -#include "tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_pass_fix.h" #include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h" @@ -195,13 +196,12 @@ Status GpuCompiler::OptimizeHloModule( /*layout_sensitive=*/false, /*allow_mixed_precision=*/false); - pass.AddPass(); - // BatchNormExpander can create zero-sized ops, so zero-sized HLO // elimination has to come after that pass. 
pass.AddPass(); pass.AddPass(GatherExpander::kEliminateSimpleGathers); + pass.AddPass(ScatterExpander::kEliminateSimpleScatters); AlgebraicSimplifierOptions options; // When transposes appear in a fusion node, we can easily adjust the @@ -516,15 +516,22 @@ static Status CompileModuleToLlvmIrImpl( DumpHloModuleIfEnabled(*hlo_module, **buffer_assignment, "after_optimizations"); + mlir::registerAllDialects(); + mlir::MLIRContext mlir_context; + IrEmitterContext ir_emitter_context( hlo_module, buffer_assignment->get(), platform_name, gpu_device_info, - cuda_compute_capability, profile_index_map, llvm_module->get()); + cuda_compute_capability, profile_index_map, &mlir_context, + llvm_module->get()); HloComputation* entry_computation = hlo_module->entry_computation(); - IrEmitterUnnested ir_emitter(hlo_module->config(), entry_computation, - &ir_emitter_context); - TF_RETURN_IF_ERROR(ir_emitter.EmitConstantGlobals()); + TF_ASSIGN_OR_RETURN( + auto ir_emitter, + IrEmitterUnnested::Create(hlo_module->config(), entry_computation, + &ir_emitter_context)); + + TF_RETURN_IF_ERROR(ir_emitter->EmitConstantGlobals()); { XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunBackend - IR emission"); @@ -533,9 +540,10 @@ static Status CompileModuleToLlvmIrImpl( ThunkSequence thunk_sequence; absl::Span order = hlo_schedule->ThunkLaunchOrder(); for (HloInstruction* instruction : order) { - TF_RETURN_IF_ERROR(instruction->Visit(&ir_emitter)); - TF_RETURN_IF_ERROR(ir_emitter.Postprocess(instruction)); - std::unique_ptr thunks = ir_emitter.ConsumeThunkSequence(); + TF_RETURN_IF_ERROR(instruction->Visit(ir_emitter.get())); + TF_RETURN_IF_ERROR(ir_emitter->Postprocess(instruction)); + std::unique_ptr thunks = + ir_emitter->ConsumeThunkSequence(); // The invariants between each input HloInstruction* and output Thunk* are // not all explicitly checked, but at least we can document them here: diff --git a/tensorflow/compiler/xla/service/gpu/gpu_scatter_expander.cc b/tensorflow/compiler/xla/service/gpu/gpu_scatter_expander.cc index 6287f1e3ca2..31f011fa734 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_scatter_expander.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_scatter_expander.cc @@ -23,26 +23,11 @@ limitations under the License. namespace xla { -StatusOr GpuScatterExpander::Run(HloModule* module) { - auto is_nontrivial_scatter = [](HloInstruction* inst) { - // TODO(b/129698548): Scattering elements larger than 64 bits is not - // supported by XLA:GPU. - return inst->opcode() == HloOpcode::kScatter && - inst->shape().element_type() == C128; - }; - - std::vector scatter_instrs; - for (HloComputation* computation : module->MakeNonfusionComputations()) { - absl::c_copy_if(computation->instructions(), - std::back_inserter(scatter_instrs), is_nontrivial_scatter); - } - - for (HloInstruction* inst : scatter_instrs) { - TF_ASSIGN_OR_RETURN(HloInstruction * expanded_root, ExpandScatter(inst)); - TF_RETURN_IF_ERROR(inst->parent()->ReplaceInstruction(inst, expanded_root)); - } - - return !scatter_instrs.empty(); +bool GpuScatterExpander::InstructionMatchesPattern(HloInstruction* inst) { + // TODO(b/129698548): Scattering elements larger than 64 bits is not + // supported by XLA:GPU. 
+ return inst->opcode() == HloOpcode::kScatter && + primitive_util::BitWidth(inst->shape().element_type()) > 64; } } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/gpu_scatter_expander.h b/tensorflow/compiler/xla/service/gpu/gpu_scatter_expander.h index 0818b32474f..92acb909729 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_scatter_expander.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_scatter_expander.h @@ -20,10 +20,17 @@ limitations under the License. namespace xla { +// Legalizes scatters on the GPU. class GpuScatterExpander : public ScatterExpander { public: + // Although we pass kEliminateAllScatters, we override this behavior in + // InstruuctionMatchesPattern and select only some scatters to expand. + GpuScatterExpander() : ScatterExpander(kEliminateAllScatters) {} + absl::string_view name() const override { return "gpu_scatter_expander"; } - StatusOr Run(HloModule* module) override; + + protected: + bool InstructionMatchesPattern(HloInstruction* inst) override; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc index 5d38d1b727c..332db83b6ad 100644 --- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc +++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc @@ -117,11 +117,11 @@ static bool HasMeaningfulName(llvm::Value* value) { return false; } -llvm::Value* HloToIrBindings::GetTypedIrValue(const HloInstruction& hlo, - ShapeIndexView shape_index, - llvm::Value* ir_value) { - llvm::Type* pointee_type = llvm_ir::ShapeToIrType( - ShapeUtil::GetSubshape(hlo.shape(), shape_index), module_); +llvm::Value* CastToTypedValue(const Shape& shape, llvm::Value* ir_value, + llvm::IRBuilder<>* b) { + llvm::Type* pointee_type = + llvm_ir::ShapeToIrType(shape, b->GetInsertBlock()->getModule()); + llvm::Type* dest_type = pointee_type->getPointerTo(); llvm::Value* typed_ir_value; @@ -129,9 +129,17 @@ llvm::Value* HloToIrBindings::GetTypedIrValue(const HloInstruction& hlo, typed_ir_value = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( llvm::cast(ir_value), dest_type); } else { - typed_ir_value = b_->CreatePointerBitCastOrAddrSpaceCast( + typed_ir_value = b->CreatePointerBitCastOrAddrSpaceCast( ir_value, pointee_type->getPointerTo()); } + return typed_ir_value; +} + +llvm::Value* HloToIrBindings::GetTypedIrValue(const HloInstruction& hlo, + ShapeIndexView shape_index, + llvm::Value* ir_value) { + auto typed_ir_value = CastToTypedValue( + ShapeUtil::GetSubshape(hlo.shape(), shape_index), ir_value, b_); if (!HasMeaningfulName(ir_value)) { ir_value->setName(llvm_ir::IrName(&hlo, "raw")); } diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h index 5eef6727801..3813ec6c949 100644 --- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h +++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h @@ -116,6 +116,10 @@ class HloToIrBindings { llvm::Value* temp_buffer_base_ = nullptr; }; +// Converts `ir_value` with type i8* to a typed LLVM Value* based on `shape`. 
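The GpuScatterExpander rewrite a few hunks above no longer walks the module itself; it derives from ScatterExpander with kEliminateAllScatters and narrows the selection in InstructionMatchesPattern to scatters whose element type is wider than 64 bits. The effective predicate, restated as a self-contained sketch:

#include <cstdint>

// `is_scatter` stands in for opcode == HloOpcode::kScatter, and `bit_width`
// for primitive_util::BitWidth(shape.element_type()).
bool GpuShouldExpandScatterSketch(bool is_scatter, int64_t bit_width) {
  return is_scatter && bit_width > 64;  // e.g. C128 scatters get expanded
}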
+llvm::Value* CastToTypedValue(const Shape& shape, llvm::Value* ir_value, + llvm::IRBuilder<>* b); + } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc index 6309d7fcdee..9d4ec358bd3 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc @@ -433,7 +433,7 @@ llvm::Value* EmitFullWarpShuffleDown(llvm::Value* value, llvm::Value* offset, builder->CreateZExt( builder->CreateBitCast(value, builder->getIntNTy(bit_width)), builder->getIntNTy(32 * num_segments)), - llvm::VectorType::get(builder->getInt32Ty(), num_segments)); + llvm::VectorType::get(builder->getInt32Ty(), num_segments, false)); for (int i = 0; i < num_segments; ++i) { llvm::Value* insert_val; if (target_triple.isNVPTX()) { diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h index 9c43f80dc60..7d5a8d032e6 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMITTER_CONTEXT_H_ #include "llvm/IR/Module.h" +#include "mlir/IR/MLIRContext.h" // from @llvm-project #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/gpu/launch_dimensions.h" #include "tensorflow/compiler/xla/service/hlo_execution_profile.h" @@ -34,13 +35,15 @@ class IrEmitterContext { const HloModule* hlo_module, const BufferAssignment* buffer_assignment, std::string platform_name, GpuDeviceInfo gpu_device_info, absl::optional cuda_compute_capability, - const HloProfileIndexMap* profile_index_map, llvm::Module* llvm_module) + const HloProfileIndexMap* profile_index_map, + mlir::MLIRContext* mlir_context, llvm::Module* llvm_module) : hlo_module_(hlo_module), buffer_assignment_(buffer_assignment), platform_name_(std::move(platform_name)), gpu_device_info_(gpu_device_info), cuda_compute_capability_(cuda_compute_capability), profile_index_map_(profile_index_map), + mlir_context_(mlir_context), llvm_module_(llvm_module) {} // Disallow copy and assign. IrEmitterContext(const IrEmitterContext&) = delete; @@ -57,6 +60,7 @@ class IrEmitterContext { return cuda_compute_capability_; } const HloProfileIndexMap* profile_index_map() { return profile_index_map_; } + mlir::MLIRContext* mlir_context() { return mlir_context_; } llvm::Module* llvm_module() { return llvm_module_; } NameUniquer* name_uniquer() { return &name_uniquer_; } @@ -67,6 +71,7 @@ class IrEmitterContext { GpuDeviceInfo gpu_device_info_; absl::optional cuda_compute_capability_; const HloProfileIndexMap* profile_index_map_; + mlir::MLIRContext* mlir_context_; llvm::Module* llvm_module_; NameUniquer name_uniquer_; }; diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 61b78b6004d..f88c70b1a33 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -37,6 +37,13 @@ limitations under the License. 
#include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" +#include "tensorflow/compiler/mlir/xla/hlo_utils.h" +#include "tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h" +#include "tensorflow/compiler/mlir/xla/type_to_shape.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" @@ -144,13 +151,86 @@ void UpdateLaunchDimensions(const LaunchDimensions& launch_dims, Thunk* thunk, llvm::ConstantAsMetadata::get(threads_per_block_ir_value)})); } +const BufferAllocation* GetAllocation( + mlir::BlockArgument func_arg, const BufferAssignment& buffer_assignment) { + auto func_op = + mlir::cast(func_arg.getParentRegion()->getParentOp()); + int64 allocation_index = func_op + .getArgAttrOfType( + func_arg.getArgNumber(), "lmhlo.alloc") + .getValue() + .getSExtValue(); + return &buffer_assignment.GetAllocation(allocation_index); +} + +StatusOr GetAllocationSliceForMlir( + mlir::Value v, const BufferAssignment& buffer_assignment) { + int64 size = v.getType().cast().getSizeInBits() / 8; + + if (auto arg = v.dyn_cast()) { + return BufferAllocation::Slice(GetAllocation(arg, buffer_assignment), 0, + size); + } + + // We match two patterns here: + // * v = ViewOp(arg); + // * v = StaticMemRefCastOp(ViewOp(arg)); + if (mlir::Operation* op = v.getDefiningOp()) { + if (auto cast = mlir::dyn_cast(op)) { + mlir::Value source = cast.getViewSource(); + op = source.getDefiningOp(); + if (!op) { + return Unimplemented("StaticMemRefCastOp has to wrap an op"); + } + } + if (auto view = mlir::dyn_cast(op)) { + return BufferAllocation::Slice( + GetAllocation(view.source().cast(), + buffer_assignment), + mlir::cast(view.byte_shift().getDefiningOp()) + .value() + .cast() + .getValue() + .getSExtValue(), + size); + } + return Unimplemented("StaticMemRefCastOp has to wrap a ViewOp"); + } + + return Unimplemented( + "Operand has to be in the form of ViewOp(arg) or " + "StaticMemRefCastOp(ViewOp(arg))"); +} + +absl::string_view GetHloName(mlir::Operation* op) { + if (auto attr = op->getAttrOfType("name")) { + auto ref = attr.getValue(); + return absl::string_view(ref.data(), ref.size()); + } + return ""; +} + } // namespace IrEmitterUnnested::IrEmitterUnnested(const HloModuleConfig& hlo_module_config, const HloComputation* hlo_computation, IrEmitterContext* ir_emitter_context) : IrEmitter(hlo_module_config, ir_emitter_context, /*is_nested=*/false), - hlo_computation_(hlo_computation) {} + hlo_computation_(hlo_computation), + mlir_scratch_module_(mlir::ModuleOp::create( + mlir::Builder(ir_emitter_context->mlir_context()).getUnknownLoc())), + lhlo_scratch_emitter_(ir_emitter_context_->buffer_assignment(), + *hlo_computation, mlir_scratch_module_.get()) {} + +StatusOr> IrEmitterUnnested::Create( + const HloModuleConfig& hlo_module_config, + const HloComputation* hlo_computation, + IrEmitterContext* ir_emitter_context) { + auto emitter = std::unique_ptr(new IrEmitterUnnested( + hlo_module_config, hlo_computation, ir_emitter_context)); + TF_RETURN_IF_ERROR(emitter->lhlo_scratch_emitter_.Initialize()); + return std::move(emitter); +} Status IrEmitterUnnested::Postprocess(HloInstruction* hlo) { bindings_.UnbindAllLocalIrValues(); @@ -158,12 
+238,11 @@ Status IrEmitterUnnested::Postprocess(HloInstruction* hlo) { } llvm::Function* IrEmitterUnnested::BuildKernelPrototype( - const HloInstruction& inst, - absl::Span args) { + absl::string_view name, absl::Span args) { // Compute the kernel name. The opcode string may contain "-" which cannot be // in a PTX function name, so sanitize the name before uniquifying it. string kernel_name = ir_emitter_context_->name_uniquer()->GetUniqueName( - llvm_ir::SanitizeFunctionName(inst.name())); + llvm_ir::SanitizeFunctionName(std::string(name))); // Create the kernel and add it to the module. llvm::Module* module = ir_emitter_context_->llvm_module(); @@ -359,7 +438,8 @@ Status IrEmitterUnnested::HandleDot(HloInstruction* dot) { } Status IrEmitterUnnested::HandleConditional(HloInstruction* conditional) { - AddThunkToThunkSequence(BuildConditionalThunk(conditional)); + TF_ASSIGN_OR_RETURN(auto thunk, BuildConditionalThunk(conditional)); + AddThunkToThunkSequence(std::move(thunk)); return Status::OK(); } @@ -1038,10 +1118,13 @@ Status IrEmitterUnnested::HandleWhile(HloInstruction* xla_while) { // Build ForThunk for conformant while loops, otherwise build WhileThunk. auto config = xla_while->backend_config(); if (config.ok() && config.ValueOrDie().has_known_trip_count()) { - AddThunkToThunkSequence( + TF_ASSIGN_OR_RETURN( + auto thunk, BuildForThunk(xla_while, config.ValueOrDie().known_trip_count().n())); + AddThunkToThunkSequence(std::move(thunk)); } else { - AddThunkToThunkSequence(BuildWhileThunk(xla_while)); + TF_ASSIGN_OR_RETURN(auto thunk, BuildWhileThunk(xla_while)); + AddThunkToThunkSequence(std::move(thunk)); } return Status::OK(); } @@ -1264,39 +1347,109 @@ Status IrEmitterUnnested::HandleSelect(HloInstruction* select) { return IrEmitter::HandleSelect(select); } +StatusOr +IrEmitterUnnested::GetOrCreateSubComputationFromRegion(mlir::Region* region) { + std::unique_ptr& module = scratch_nested_computations_[region]; + if (module == nullptr) { + xla::XlaComputation xla_computation; + TF_RETURN_IF_ERROR(ConvertRegionToComputation(region, &xla_computation)); + TF_ASSIGN_OR_RETURN(auto program_shape, xla_computation.GetProgramShape()); + TF_ASSIGN_OR_RETURN( + module, HloModule::CreateFromProto(xla_computation.proto(), + HloModuleConfig(program_shape))); + } + return module->entry_computation(); +} + Status IrEmitterUnnested::HandleSort(HloInstruction* sort) { + MlirEmitterInput result; + + TF_ASSIGN_OR_RETURN(auto sort_op, lhlo_scratch_emitter_.EmitSortOp(sort)); + result.op = sort_op; + result.name = GetHloName(sort_op); + // The name in sort op has no semantics, and it's for debug only. If the name + // doesn't exist, we should use a namer (e.g. count-based). + // TODO(timshen): use a namer instead of relying on the HloInstruction names. 
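HandleSort above now lowers through an LMHLO sort op. EmitMlirSort, defined in the next hunk, crafts one buffer slice per sort output plus a final slice for the on-device tuple storage; the sorting kernels work in place, so no separate operand slices are needed. A schematic of that layout (toy struct, not the XLA BufferAllocation machinery):

#include <vector>

struct SliceSketch {
  int source;    // which output (0..n-1), or -1 for the tuple-storage slice
  bool written;  // every slice here is written by the sort kernels
};

std::vector<SliceSketch> SortSliceLayoutSketch(int operand_count) {
  std::vector<SliceSketch> slices;
  for (int i = 0; i < operand_count; ++i) {
    slices.push_back({/*source=*/i, /*written=*/true});
  }
  slices.push_back({/*source=*/-1, /*written=*/true});  // n + 1st: tuple buffer
  return slices;
}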
+ if (result.name.empty()) { + result.name = sort->name(); + } + const auto& buffer_assignment = ir_emitter_context_->buffer_assignment(); + auto& slice = result.extra_slice; + TF_ASSIGN_OR_RETURN(slice.buffer_slice, + buffer_assignment.GetUniqueSlice(sort, {})); + slice.written = true; + slice.shape = sort->shape(); + + result.thunk_info = GetThunkInfo(sort); + + return EmitMlirSort(result); +} + +Status IrEmitterUnnested::EmitMlirSort(MlirEmitterInput input) { + const auto& buffer_assignment = ir_emitter_context_->buffer_assignment(); + auto sort_op = mlir::cast(input.op); + + int operand_count = sort_op.operands().size(); + std::vector operand_shapes(operand_count); + std::vector slices; + std::vector output_shapes(sort_op.output().size()); + + for (int i = 0; i < operand_count; i++) { + operand_shapes[i] = + TypeToShape(sort_op.operands()[i].getType().cast()); + } + + // Craft n + 1 slices, where the first n are output parameters, and the last + // is the on-device tuple storage. We don't need n operands because sorting + // kernels are always in-place. + for (int i = 0; i < operand_count; i++) { + output_shapes[i] = + TypeToShape(sort_op.output()[i].getType().cast()); + MlirBufferSlice slice; + TF_ASSIGN_OR_RETURN( + slice.buffer_slice, + GetAllocationSliceForMlir(sort_op.output()[i], buffer_assignment)); + slice.written = true; + slice.shape = operand_shapes[i]; + slices.push_back(slice); + } + slices.push_back(input.extra_slice); + std::vector> thunks; - Shape keys_shape = sort->operand(0)->shape(); - int64 dimension_to_sort = sort->dimensions(0); - for (int64 i = 0; i < sort->operand_count(); ++i) { - ShapeIndex shape_index = - sort->operand_count() > 1 ? ShapeIndex({i}) : ShapeIndex({}); + + Shape keys_shape = operand_shapes[0]; + int64 dimension_to_sort = sort_op.dimension().getSExtValue(); + for (int64 i = 0; i < operand_count; ++i) { // We assume that the layout of all involved operands and outputs is the // same. - TF_RET_CHECK(LayoutUtil::LayoutsInShapesEqual(keys_shape, - sort->operand(i)->shape())); - TF_RET_CHECK(LayoutUtil::LayoutsInShapesEqual( - keys_shape, ShapeUtil::GetSubshape(sort->shape(), shape_index))); + TF_RET_CHECK( + LayoutUtil::LayoutsInShapesEqual(keys_shape, operand_shapes[i])); + TF_RET_CHECK( + LayoutUtil::LayoutsInShapesEqual(keys_shape, output_shapes[i])); // If possible, we share buffers. If that is not possible, we need to copy // the values, because the emitter does the sorting in-place. - auto destination_buffer = GetAllocationSlice(*sort, shape_index); - auto source_address = GetAllocationSlice(*sort->operand(i)); + TF_ASSIGN_OR_RETURN( + auto destination_buffer, + GetAllocationSliceForMlir(sort_op.output()[i], buffer_assignment)); + TF_ASSIGN_OR_RETURN( + auto source_address, + GetAllocationSliceForMlir(sort_op.operands()[i], buffer_assignment)); if (destination_buffer != source_address) { // TODO(b/26783907): Figure out why we never seem to share buffers for // key/value sort. 
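The stage count computed just below drives the bitonic sort: a dimension of size bound needs ceil(log2(bound)) stages, which is exactly what the two CHECKs around num_stages assert. A standalone stand-in for that Log2Ceiling call:

#include <cstdint>

// Smallest k with 2^k >= bound, so CHECK_GE(1 << k, bound) and
// CHECK_LT(1 << (k - 1), bound) both hold for bound > 1.
int64_t Log2CeilingSketch(uint64_t bound) {
  int64_t k = 0;
  while ((uint64_t{1} << k) < bound) ++k;
  return k;  // e.g. bound = 5 -> 3, since 8 >= 5 and 4 < 5
}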
- VLOG(2) << sort->name() << " requires initial D2D copy for operand " << i; + VLOG(2) << input.name << " requires initial D2D copy for operand " << i; thunks.push_back(absl::make_unique( Thunk::ThunkInfo(), /*source_address=*/source_address, /*destination_buffer=*/destination_buffer, - /*mem_size=*/ShapeUtil::ByteSizeOf(sort->operand(i)->shape()))); + /*mem_size=*/ShapeUtil::ByteSizeOf(operand_shapes[i]))); } } uint64 dimension_to_sort_bound = keys_shape.dimensions(dimension_to_sort); int64 num_stages = tensorflow::Log2Ceiling(dimension_to_sort_bound); - VLOG(2) << sort->name() << " requires " << num_stages << " stages."; + VLOG(2) << input.name << " requires " << num_stages << " stages."; CHECK_GE(1ULL << num_stages, dimension_to_sort_bound); CHECK_LT(1ULL << (num_stages - 1), dimension_to_sort_bound); @@ -1360,10 +1513,10 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) { // we have not enough threads, or not enough shared memory. Also it does not // give a speedup if the tile size is < 128. int64 total_shared_memory_needed = 0; - for (int64 i = 0; i < sort->operand_count(); ++i) { + for (int64 i = 0; i < operand_count; ++i) { total_shared_memory_needed += - kTileSize * ShapeUtil::ByteSizeOfPrimitiveType( - sort->operand(i)->shape().element_type()); + kTileSize * + ShapeUtil::ByteSizeOfPrimitiveType(operand_shapes[i].element_type()); } bool no_tiling = kTileSize < 128 || @@ -1376,7 +1529,7 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) { "kTileSize=%d < 128, " "kThreadsPerBlock=%d > threads_per_block_limit=%d, " "total_shared_memory_needed=%d > shared_memory_per_block=%d", - sort->name(), (no_tiling ? "won't" : "will"), kTileSize, kThreadsPerBlock, + input.name, (no_tiling ? "won't" : "will"), kTileSize, kThreadsPerBlock, ir_emitter_context_->gpu_device_info().threads_per_block_limit, total_shared_memory_needed, ir_emitter_context_->gpu_device_info().shared_memory_per_block); @@ -1384,37 +1537,38 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) { uint64 num_blocks = CeilOfRatio(num_iterations, kThreadsPerBlock); LaunchDimensions tiled_launch_dimensions(num_blocks, kThreadsPerBlock); VLOG(2) << absl::StreamFormat("%s launch dims: %d blocks, %d threads/block", - sort->name(), num_blocks, kThreadsPerBlock); + input.name, num_blocks, kThreadsPerBlock); + std::vector ir_arrays; auto emit_kernel = [&](absl::Span xor_masks) { VLOG(2) << absl::StreamFormat( - "%s uses kernel for xor masks [%s]", sort->name(), + "%s uses kernel for xor masks [%s]", input.name, absl::StrJoin(xor_masks, ", ", [](std::string* out, int64 xor_mask) { absl::StrAppendFormat(out, "0x%x", xor_mask); })); - thunks.push_back( - BuildKernelThunk(sort, /*implements_whole_instruction=*/false)); + thunks.push_back(BuildKernelThunkForMlir(input.name, Thunk::ThunkInfo(), + slices, &ir_arrays)); LaunchDimensions launch_dimensions = xor_masks.size() > 1 ? tiled_launch_dimensions : standard_launch_dimensions; UpdateLaunchDimensions(launch_dimensions, thunks.back().get(), ir_emitter_context_->llvm_module()); std::vector values_arrays; - values_arrays.reserve(sort->operand_count()); - for (int64 i = 0; i < sort->operand_count(); ++i) { - ShapeIndex shape_index = - sort->operand_count() > 1 ? 
ShapeIndex({i}) : ShapeIndex({}); - values_arrays.push_back(GetIrArray(*sort, *sort, shape_index)); + values_arrays.reserve(operand_count); + for (int64 i = 0; i < operand_count; ++i) { + values_arrays.push_back(ir_arrays[i]); } + TF_ASSIGN_OR_RETURN( + const HloComputation* comparator, + GetOrCreateSubComputationFromRegion(&sort_op.comparator())); return llvm_ir::EmitSortInPlace( - dimension_to_sort, values_arrays, IrName(sort), xor_masks, &b_, + dimension_to_sort, values_arrays, IrName(input.name), xor_masks, &b_, launch_dimensions, xor_masks.size() > 1 ? num_iterations_in_sort_dim : standard_num_iterations_in_sort_dim, kTileSize, [&](absl::Span operands, llvm::Value* output) { - return EmitCallToNestedComputation(*sort->to_apply(), operands, - output); + return EmitCallToNestedComputation(*comparator, operands, output); }); }; std::vector xor_masks; @@ -1441,17 +1595,18 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) { TF_RETURN_IF_ERROR(emit_kernel(xor_masks)); } VLOG(2) << absl::StreamFormat( - "%s requires %d thunks (including any D2D copies)", sort->name(), + "%s requires %d thunks (including any D2D copies)", input.name, thunks.size()); - AddThunkToThunkSequence(absl::make_unique( - GetThunkInfo(sort), std::move(thunks))); - if (sort->operand_count() > 1) { + AddThunkToThunkSequence( + absl::make_unique(input.thunk_info, std::move(thunks))); + if (operand_count > 1) { // Emit the tuple as part of the last stage of sorting. // We are currently in the block sorted.in_bounds.after. b_.SetInsertPoint(b_.GetInsertBlock()->getTerminator()); - llvm_ir::EmitTuple(GetIrArray(*sort, *sort), - ConstructIrArrayForOutputs(*sort), &b_); + llvm_ir::EmitTuple( + ir_arrays[operand_count], + absl::MakeSpan(ir_arrays).subspan(0, ir_arrays.size() - 1), &b_); } return Status::OK(); } @@ -1589,24 +1744,6 @@ Status IrEmitterUnnested::HandleAfterAll(HloInstruction* after_all) { return Status::OK(); } -// Describes how to access a particular subshape for an HLO. For instance if -// `.hlo_index` is {1} and `.gte_index` is {3, 4} then buffer for `.instr` at -// ShapeIndex {1} (i.e. the buffer for the second tuple element of hlo) is found -// at `.buffer_slice`[3][4]. That is, `.slice` is a void***, which we -// dereference twice -- first at index 3, and then at index 4 -- to get the -// address of our buffer. -struct HloBufferSlice { - const HloInstruction* instr; - ShapeIndex hlo_index; - - // The root buffer to look at. - BufferAllocation::Slice buffer_slice; - - // Describes how to dereference starting at that buffer to get to the buffer - // in question. - ShapeIndex gte_index; -}; - // Figures out how to access the buffers for all subshapes of hlo's operands and // for hlo itself (i.e. all the buffers produced by HLO). // @@ -1715,22 +1852,22 @@ static std::vector GetHloBufferSlices( return result; } -std::unique_ptr IrEmitterUnnested::BuildKernelThunk( - const HloInstruction* inst, bool implements_whole_instruction) { - const BufferAssignment& buffer_assn = - ir_emitter_context_->buffer_assignment(); - - std::vector hlo_slices = - GetHloBufferSlices(inst, buffer_assn); +std::unique_ptr +IrEmitterUnnested::BuildKernelThunkFromBufferSlices( + absl::string_view name, Thunk::ThunkInfo thunk_info, + absl::Span slices, + std::function + bind_slice_to_ir_value) { + const auto& buffer_assn = ir_emitter_context_->buffer_assignment(); // Figure out which buffer allocations need to be passed as arguments to our - // kernel. This is simply all of the allocations referenced in hlo_slices, + // kernel. 
This is simply all of the allocations referenced in slices, // plus the XLA temp buffer (if we have it). We always include the temp // buffer because even if the kernel itself doesn't use it, a nested // subcomputation within the kernel (e.g. a kMap's computation) might. std::unordered_set buffers_needed; - for (const auto& hlo_buffer_slice : hlo_slices) { - buffers_needed.insert(hlo_buffer_slice.buffer_slice.allocation()); + for (auto* slice : slices) { + buffers_needed.insert(slice->buffer_slice.allocation()); } absl::optional temp_buffer; for (const BufferAllocation& alloc : buffer_assn.Allocations()) { @@ -1759,7 +1896,7 @@ std::unique_ptr IrEmitterUnnested::BuildKernelThunk( return a->index() < b->index(); }); - llvm::Function* kernel = BuildKernelPrototype(*inst, non_constant_buffers); + llvm::Function* kernel = BuildKernelPrototype(name, non_constant_buffers); // Build a map from a BufferAllocation to the corresponding argument in our // kernel. @@ -1793,24 +1930,19 @@ std::unique_ptr IrEmitterUnnested::BuildKernelThunk( // For each buffer our kernel might want to touch, bind it to a value derived // from our kernel args. - for (const auto& hlo_buffer_slice : hlo_slices) { - const HloInstruction* instr = hlo_buffer_slice.instr; - const ShapeIndex& index = hlo_buffer_slice.hlo_index; - const BufferAllocation::Slice& slice = hlo_buffer_slice.buffer_slice; - const ShapeIndex& gte_index = hlo_buffer_slice.gte_index; - - VLOG(3) << "Buffer for " << instr->ToString() << " at " << index.ToString() - << " is found in slice " << slice.ToString() << " at GTE index " - << gte_index.ToString(); + for (auto* slice : slices) { + const BufferAllocation::Slice& buffer_slice = slice->buffer_slice; + const ShapeIndex& gte_index = slice->gte_index; llvm::Value* loc; - if (slice.allocation()->is_constant()) { + if (buffer_slice.allocation()->is_constant()) { loc = ir_emitter_context_->llvm_module()->getGlobalVariable( - llvm_ir::ConstantBufferAllocationToGlobalName(*slice.allocation())); + llvm_ir::ConstantBufferAllocationToGlobalName( + *buffer_slice.allocation())); CHECK_NE(loc, nullptr); } else { - loc = InBoundsGEP(kernel_args.at(slice.allocation()), - {b_.getInt64(slice.offset())}); + loc = InBoundsGEP(kernel_args.at(buffer_slice.allocation()), + {b_.getInt64(buffer_slice.offset())}); } // If gte_index is nonempty, we have to dereference `loc` to get to the @@ -1822,7 +1954,7 @@ std::unique_ptr IrEmitterUnnested::BuildKernelThunk( loc = Load(InBoundsGEP(loc, {b_.getInt64(idx)})); } - bindings_.BindHloToIrValue(*instr, loc, index); + bind_slice_to_ir_value(slice, loc); } // Bind the temp buffer so that nested subcomputations can find it if they @@ -1834,9 +1966,66 @@ std::unique_ptr IrEmitterUnnested::BuildKernelThunk( llvm::ConstantPointerNull::get(b_.getInt8PtrTy())); } - return absl::make_unique( + return absl::make_unique(thunk_info, non_constant_buffers, + std::string(kernel->getName())); +} + +std::unique_ptr IrEmitterUnnested::BuildKernelThunk( + const HloInstruction* inst, bool implements_whole_instruction) { + std::vector hlo_slices = + GetHloBufferSlices(inst, ir_emitter_context_->buffer_assignment()); + + std::vector slice_ptrs; + slice_ptrs.reserve(hlo_slices.size()); + for (auto& slice : hlo_slices) { + slice_ptrs.push_back(&slice); + } + + return BuildKernelThunkFromBufferSlices( + inst->name(), implements_whole_instruction ? 
GetThunkInfo(inst) : Thunk::ThunkInfo(), - non_constant_buffers, std::string(kernel->getName())); + slice_ptrs, [this](const BufferSlice* slice, llvm::Value* value) { + const HloBufferSlice* hlo_buffer_slice = + static_cast(slice); + const HloInstruction* instr = hlo_buffer_slice->instr; + const ShapeIndex& index = hlo_buffer_slice->hlo_index; + VLOG(3) << "Buffer for " << instr->ToString() << " at " + << index.ToString() << " is found in slice " + << hlo_buffer_slice->buffer_slice.ToString() << " at GTE index " + << hlo_buffer_slice->gte_index.ToString(); + + bindings_.BindHloToIrValue(*instr, value, index); + }); +} + +std::unique_ptr IrEmitterUnnested::BuildKernelThunkForMlir( + absl::string_view name, Thunk::ThunkInfo thunk_info, + absl::Span slices, + std::vector* ir_arrays) { + absl::flat_hash_set buffers_written; + std::vector slice_ptrs; + slice_ptrs.reserve(slices.size()); + for (auto& slice : slices) { + slice_ptrs.push_back(&slice); + if (slice.written) { + buffers_written.insert(slice.buffer_slice); + } + } + + ir_arrays->clear(); + return BuildKernelThunkFromBufferSlices( + name, thunk_info, slice_ptrs, + [&](const BufferSlice* slice, llvm::Value* value) { + const auto& mlir_slice = static_cast(*slice); + + llvm_ir::IrArray ir_array( + CastToTypedValue(mlir_slice.shape, value, &b_), mlir_slice.shape); + if (!buffers_written.contains(slice->buffer_slice)) { + ir_array.MarkInvariantOverWholeProgram(&value->getContext()); + } + + ir_arrays->push_back(ir_array); + }); } StatusOr> IrEmitterUnnested::BuildInitializerThunk( @@ -2043,7 +2232,7 @@ Status CheckConditionalBuffersShareAllocation( } // namespace -std::unique_ptr IrEmitterUnnested::BuildWhileThunk( +StatusOr> IrEmitterUnnested::BuildWhileThunk( const HloInstruction* hlo) { // Check that all while-related buffers share an allocation. TF_CHECK_OK(CheckWhileBuffersShareAllocation( @@ -2051,24 +2240,26 @@ std::unique_ptr IrEmitterUnnested::BuildWhileThunk( // Generate thunk sequence for while 'condition'. HloComputation* condition = hlo->while_condition(); - IrEmitterUnnested ir_emitter_condition(hlo_module_config_, condition, - ir_emitter_context_); - TF_CHECK_OK(condition->Accept(&ir_emitter_condition)); + TF_ASSIGN_OR_RETURN(auto ir_emitter_condition, + IrEmitterUnnested::Create(hlo_module_config_, condition, + ir_emitter_context_)); + TF_RETURN_IF_ERROR(condition->Accept(ir_emitter_condition.get())); // Generate thunk sequence for while 'body'. HloComputation* body = hlo->while_body(); - IrEmitterUnnested ir_emitter_body(hlo_module_config_, body, - ir_emitter_context_); - TF_CHECK_OK(body->Accept(&ir_emitter_body)); + TF_ASSIGN_OR_RETURN( + auto ir_emitter_body, + IrEmitterUnnested::Create(hlo_module_config_, body, ir_emitter_context_)); + TF_RETURN_IF_ERROR(body->Accept(ir_emitter_body.get())); - return absl::make_unique( + return std::unique_ptr(new WhileThunk( GetThunkInfo(hlo), GetAllocationSlice(*condition->root_instruction()), // cond result - ir_emitter_condition.ConsumeThunkSequence(), - ir_emitter_body.ConsumeThunkSequence()); + ir_emitter_condition->ConsumeThunkSequence(), + ir_emitter_body->ConsumeThunkSequence())); } -std::unique_ptr IrEmitterUnnested::BuildForThunk( +StatusOr> IrEmitterUnnested::BuildForThunk( const HloInstruction* hlo, const int64 loop_limit) { // Check that all while-related buffers share an allocation. 
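Note: BuildKernelThunkFromBufferSlices above lets the existing HLO path and the new MLIR path share one kernel-thunk builder by deferring the slice-to-value binding to a callback: the HLO overload binds each value to an HloInstruction, while the MLIR overload wraps it in an llvm_ir::IrArray and marks read-only buffers invariant. A stripped-down illustration of that shape using made-up stand-in types, not the XLA classes:

#include <functional>
#include <iostream>
#include <string>
#include <vector>

// Stand-ins for BufferSlice and the llvm::Value* location.
struct SliceSketch {
  std::string label;
  bool written;
};
using LocationSketch = int;

// One builder computes a location per slice; the caller decides how to bind it.
void BuildKernelFromSlicesSketch(
    const std::vector<SliceSketch>& slices,
    const std::function<void(const SliceSketch&, LocationSketch)>& bind_slice) {
  for (int i = 0; i < static_cast<int>(slices.size()); ++i) {
    bind_slice(slices[i], /*location=*/i);  // stand-in for the kernel-argument GEP
  }
}

int main() {
  std::vector<SliceSketch> slices = {{"operand", false}, {"output", true}};
  // "MLIR-style" binding: treat read-only slices differently from written ones.
  BuildKernelFromSlicesSketch(slices, [](const SliceSketch& s, LocationSketch loc) {
    std::cout << s.label << " -> arg " << loc
              << (s.written ? " (written)" : " (invariant)") << "\n";
  });
  return 0;
}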
TF_CHECK_OK(CheckWhileBuffersShareAllocation( @@ -2076,15 +2267,16 @@ std::unique_ptr IrEmitterUnnested::BuildForThunk( // Generate thunk sequence for while 'body' (will be used a For loop body). HloComputation* body = hlo->while_body(); - IrEmitterUnnested ir_emitter_body(hlo_module_config_, body, - ir_emitter_context_); - TF_CHECK_OK(body->Accept(&ir_emitter_body)); + TF_ASSIGN_OR_RETURN( + auto ir_emitter_body, + IrEmitterUnnested::Create(hlo_module_config_, body, ir_emitter_context_)); + TF_RETURN_IF_ERROR(body->Accept(ir_emitter_body.get())); - return absl::make_unique(GetThunkInfo(hlo), loop_limit, - ir_emitter_body.ConsumeThunkSequence()); + return std::unique_ptr(new ForThunk( + GetThunkInfo(hlo), loop_limit, ir_emitter_body->ConsumeThunkSequence())); } -std::unique_ptr IrEmitterUnnested::BuildConditionalThunk( +StatusOr> IrEmitterUnnested::BuildConditionalThunk( const HloInstruction* hlo) { // Check that the buffers used in conditional are shared with the operands and // result appropriately. @@ -2096,15 +2288,17 @@ std::unique_ptr IrEmitterUnnested::BuildConditionalThunk( for (int j = 0; j < hlo->branch_count(); ++j) { branch_operands.emplace_back(GetAllocationSlice(*hlo->operand(j + 1))); HloComputation* branch_computation = hlo->branch_computation(j); - IrEmitterUnnested ir_emitter(hlo_module_config_, branch_computation, - ir_emitter_context_); - TF_CHECK_OK(branch_computation->Accept(&ir_emitter)); - branch_thunks.push_back(std::move(*ir_emitter.ConsumeThunkSequence())); + TF_ASSIGN_OR_RETURN( + auto ir_emitter, + IrEmitterUnnested::Create(hlo_module_config_, branch_computation, + ir_emitter_context_)); + TF_CHECK_OK(branch_computation->Accept(ir_emitter.get())); + branch_thunks.push_back(std::move(*ir_emitter->ConsumeThunkSequence())); } - return absl::make_unique( + return std::unique_ptr(new ConditionalThunk( GetThunkInfo(hlo), GetAllocationSlice(*hlo->operand(0)), branch_operands, - std::move(branch_thunks)); + std::move(branch_thunks))); } Status IrEmitterUnnested::EmitTargetElementLoopInThunk( diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h index 019fcdf21db..b9146dd8fae 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMITTER_UNNESTED_H_ #include "absl/container/inlined_vector.h" +#include "tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.h" #include "tensorflow/compiler/xla/service/gpu/ir_emitter.h" #include "tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h" #include "tensorflow/compiler/xla/service/gpu/sequential_thunk.h" @@ -28,6 +29,40 @@ limitations under the License. namespace xla { namespace gpu { +struct BufferSlice { + // The root buffer to look at. + BufferAllocation::Slice buffer_slice; + + // Describes how to dereference starting at that buffer to get to the buffer + // in question. + ShapeIndex gte_index; +}; + +// Describes how to access a particular subshape for an HLO. For instance if +// `.hlo_index` is {1} and `.gte_index` is {3, 4} then buffer for `.instr` at +// ShapeIndex {1} (i.e. the buffer for the second tuple element of hlo) is +// found at `.buffer_slice`[3][4]. That is, `.slice` is a void***, which we +// dereference twice -- first at index 3, and then at index 4 -- to get the +// address of our buffer. 
+struct HloBufferSlice : public BufferSlice { + const HloInstruction* instr; + ShapeIndex hlo_index; +}; + +struct MlirBufferSlice : public BufferSlice { + // The buffer is modified by the kernel. + bool written; + + Shape shape; +}; + +struct MlirEmitterInput { + mlir::Operation* op; + absl::string_view name; + Thunk::ThunkInfo thunk_info; + MlirBufferSlice extra_slice; +}; + // Emits LLVM IR for an "unnested computation". // // An unnested computation is an HloComputation which you run by executing one @@ -89,12 +124,14 @@ class IrEmitterUnnested : public IrEmitter, const string& loop_name, llvm::Value* tile_height, llvm::Value* tile_width, KernelSupportLibrary* ksl)>; - IrEmitterUnnested(const HloModuleConfig& hlo_module_config, - const HloComputation* hlo_computation, - IrEmitterContext* ir_emitter_context); IrEmitterUnnested(const IrEmitterUnnested&) = delete; IrEmitterUnnested& operator=(const IrEmitterUnnested&) = delete; + static StatusOr> Create( + const HloModuleConfig& hlo_module_config, + const HloComputation* hlo_computation, + IrEmitterContext* ir_emitter_context); + // Transfers the ownship of thunk_sequence_ out. std::unique_ptr ConsumeThunkSequence() { return std::make_unique(std::move(thunk_sequence_)); @@ -124,6 +161,7 @@ class IrEmitterUnnested : public IrEmitter, Status HandleScatter(HloInstruction* scatter) override; Status HandleSelect(HloInstruction* select) override; Status HandleSort(HloInstruction* sort) override; + Status EmitMlirSort(MlirEmitterInput input); Status HandleTriangularSolve(HloInstruction* hlo) override; Status HandleTupleSelect(HloInstruction* tuple_select) override; Status HandleAllReduce(HloInstruction* crs) override; @@ -148,6 +186,10 @@ class IrEmitterUnnested : public IrEmitter, Status Postprocess(HloInstruction* hlo) override; private: + IrEmitterUnnested(const HloModuleConfig& hlo_module_config, + const HloComputation* hlo_computation, + IrEmitterContext* ir_emitter_context); + // Add a owning Thunk object to the thunk sequence. void AddThunkToThunkSequence(std::unique_ptr thunk) override { thunk_sequence_.emplace_back(std::move(thunk)); @@ -264,8 +306,7 @@ class IrEmitterUnnested : public IrEmitter, // Builds the prototype of the IR kernel for `inst` and adds it to the module. // This kernel takes as arguments pointers to the given buffer allocations. llvm::Function* BuildKernelPrototype( - const HloInstruction& inst, - absl::Span args); + absl::string_view name, absl::Span args); // Helper for writing extra outputs from inside a reduce kernel. Status EmitExtraOutputsForReduce( @@ -490,6 +531,12 @@ class IrEmitterUnnested : public IrEmitter, HloComputation* reducer, llvm::Type* element_type, llvm::Value* partial_result_address); + std::unique_ptr BuildKernelThunkFromBufferSlices( + absl::string_view name, Thunk::ThunkInfo thunk_info, + absl::Span slices, + std::function + bind_slice_to_ir_value); + // Returns a KernelThunk that invokes the kernel emitted for `inst`. The // caller needs to make sure `inst` outlives the lifetime of the returned // Thunk object. 
'implements_whole_instruction' specifies whether this @@ -498,6 +545,11 @@ class IrEmitterUnnested : public IrEmitter, std::unique_ptr BuildKernelThunk( const HloInstruction* inst, bool implements_whole_instruction); + std::unique_ptr BuildKernelThunkForMlir( + absl::string_view name, Thunk::ThunkInfo thunk_info, + absl::Span slices, + std::vector* ir_arrays); + // Returns a thunk that, given a reduce or select-and-scatter op, // initializes its memory to the appropriate initial value. StatusOr> BuildInitializerThunk( @@ -505,17 +557,18 @@ class IrEmitterUnnested : public IrEmitter, // Returns a WhileThunk that invokes thunk sequences for 'condition' and // 'body' sub-computations of while instruction 'hlo'. - std::unique_ptr BuildWhileThunk(const HloInstruction* hlo); + StatusOr> BuildWhileThunk(const HloInstruction* hlo); // Returns a ForThunk which executes 'loop_limit' invocations of a thunk // sequence from the 'body' sub-computation of the while instruction 'hlo'. - std::unique_ptr BuildForThunk(const HloInstruction* hlo, - const int64 loop_limit); + StatusOr> BuildForThunk(const HloInstruction* hlo, + const int64 loop_limit); // Returns a ConditionalThunk which executes the thunk sequence for the // 'branch_computation' corresponding to the predicate/branch_index of the // given conditional instruction. - std::unique_ptr BuildConditionalThunk(const HloInstruction* hlo); + StatusOr> BuildConditionalThunk( + const HloInstruction* hlo); // Emits current thread id with the given type. // @@ -545,6 +598,9 @@ class IrEmitterUnnested : public IrEmitter, absl::optional thread_id_filter = absl::nullopt, absl::optional block_id_filter = absl::nullopt); + StatusOr GetOrCreateSubComputationFromRegion( + mlir::Region* region); + // Returns the last generated thunk. Thunk* LastThunk() const { return thunk_sequence_.back().get(); } @@ -555,6 +611,14 @@ class IrEmitterUnnested : public IrEmitter, // The HloComputation that this IrEmitter emits code for. const HloComputation* hlo_computation_; + + mlir::OwningModuleRef mlir_scratch_module_; + + // This is for cache-purpose only. It has no significant semantics. + mlir::LhloDialectEmitter lhlo_scratch_emitter_; + + absl::flat_hash_map> + scratch_nested_computations_; }; } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc index 1228a1b4823..04af67a70b9 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc @@ -62,8 +62,10 @@ limitations under the License. #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/random.h" #include "tensorflow/core/platform/tracing.h" #include "tensorflow/core/profiler/lib/traceme.h" +#include "tensorflow/core/util/env_var.h" namespace xla { namespace gpu { @@ -86,14 +88,21 @@ static string GetSmName(std::pair compute_capability) { int sm_version = 30; // If the current compute capability isn't known, fallback to the // most recent version before it. 
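Note: the fallback below picks the newest SM version LLVM knows about that does not exceed the device's compute capability, and only warns when the device is older than every supported version, since PTX built for the newest supported SM (sm_75 here) still runs on newer GPUs such as sm_80. A standalone sketch of that policy, with the capability encoded as major * 10 + minor as in the surrounding code:

#include <cstdio>

int PickSmVersionSketch(int cc_version) {
  const int supported[] = {75, 72, 70, 62, 61, 60, 53, 52, 50, 37, 35, 32, 30};
  int sm = 30;
  for (int v : supported) {
    if (v <= cc_version) {
      sm = v;
      break;
    }
  }
  // Warn only for unknown *older* capabilities; newer ones can reuse sm_75 PTX.
  if (sm != cc_version && cc_version < supported[0]) {
    std::fprintf(stderr, "Unknown compute capability %d, defaulting to sm_%d\n",
                 cc_version, sm);
  }
  return sm;
}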
- for (int v : {75, 72, 70, 62, 61, 60, 53, 52, 50, 37, 35, 32, 30}) { + int supported_versions[] = {75, 72, 70, 62, 61, 60, 53, + 52, 50, 37, 35, 32, 30}; + for (int v : supported_versions) { if (v <= compute_capability_version) { sm_version = v; break; } } - if (sm_version != compute_capability_version) { + // If the current CC isn't supported by LLVM and it is newer then + // the max supported LLVM version, do not warn about it. The end + // user can't do anything about this. PTX compiled for SM75 will + // run on SM80 too. + if (sm_version != compute_capability_version && + compute_capability_version < supported_versions[0]) { LOG(WARNING) << "Unknown compute capability (" << compute_capability.first << ", " << compute_capability.second << ") ." << "Defaulting to telling LLVM that we're compiling for sm_" @@ -570,6 +579,60 @@ static std::vector GetROCDLPaths(int amdgpu_version, return result; } +struct HsacoCacheEntry { + uint64 hash; + std::string ir; + int gfx; + std::vector hsaco; +}; + +struct HsacoCache { + protected: + std::vector cache; + std::mutex m_mutex; + int request_count = 0; + int hit_count = 0; + + public: + static bool Find(const std::string& ir, uint64_t& hash, int gfx, + std::vector& hsaco); + static void Add(const std::string& ir, uint64_t hash, int gfx, + const std::vector& hsaco); +}; + +static HsacoCache g_hsacoCache; + +bool HsacoCache::Find(const std::string& ir, uint64_t& hash, int gfx, + std::vector& hsaco) { + std::lock_guard lg(g_hsacoCache.m_mutex); + hash = std::hash{}(ir); + bool hit = false; + for (auto& x : g_hsacoCache.cache) { + if (x.hash != hash) continue; + if (x.gfx != gfx) continue; + if (x.ir != ir) continue; + hsaco = x.hsaco; + hit = true; + break; + } + g_hsacoCache.request_count++; + if (hit) g_hsacoCache.hit_count++; + if (!(g_hsacoCache.request_count % 50)) + VLOG(1) << "HSACO cache: " << g_hsacoCache.request_count << " requests, " + << g_hsacoCache.hit_count << " hits"; + return hit; +} + +void HsacoCache::Add(const std::string& ir, uint64_t hash, int gfx, + const std::vector& hsaco) { + std::lock_guard lg(g_hsacoCache.m_mutex); + g_hsacoCache.cache.resize(g_hsacoCache.cache.size() + 1); + g_hsacoCache.cache.back().ir = ir; + g_hsacoCache.cache.back().hash = hash; + g_hsacoCache.cache.back().gfx = gfx; + g_hsacoCache.cache.back().hsaco = hsaco; +} + // Emits the given module to HSA Code Object. target_machine is an initialized // TargetMachine for the AMDGPU target. StatusOr> EmitModuleToHsaco( @@ -584,18 +647,29 @@ StatusOr> EmitModuleToHsaco( std::string tempdir_name = tempdir_vector.front(); VLOG(1) << "Compile-time artifacts located at: " << tempdir_name; + bool keep_tempfiles = false; + TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_ROCM_KEEP_XLA_TEMPFILES", + /*default_val=*/false, + &keep_tempfiles)); // Prepare filenames for all stages of compilation: // IR, binary ISA, and HSACO. 
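Note: the HsacoCache added above memoizes compiled code objects keyed on the module's LLVM IR and the gfx version, guarded by a mutex; the real cache also stores the hash and compares the full IR string so a hash collision cannot return the wrong binary. A self-contained sketch of the same idea built from standard containers (our own simplified class, not the one in the patch):

#include <cstdint>
#include <functional>
#include <mutex>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

class BinaryCacheSketch {
 public:
  bool Find(const std::string& ir, int gfx, std::vector<uint8_t>* out) {
    std::lock_guard<std::mutex> lock(mu_);
    auto it = table_.find(Key(ir, gfx));
    if (it == table_.end()) return false;  // miss: caller compiles and Adds
    *out = it->second;
    return true;
  }
  void Add(const std::string& ir, int gfx, std::vector<uint8_t> binary) {
    std::lock_guard<std::mutex> lock(mu_);
    table_[Key(ir, gfx)] = std::move(binary);
  }

 private:
  // The real cache keeps the hash and the full IR to rule out collisions.
  static std::string Key(const std::string& ir, int gfx) {
    return std::to_string(std::hash<std::string>{}(ir)) + ":" +
           std::to_string(gfx);
  }
  std::mutex mu_;
  std::unordered_map<std::string, std::vector<uint8_t>> table_;
};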
- std::string ir_filename = absl::StrCat(module->getModuleIdentifier(), ".ll"); + std::string random_number = std::to_string(tensorflow::random::New64()); + std::string ir_filename = + absl::StrCat(module->getModuleIdentifier(), random_number + ".ll"); std::string ir_path = tensorflow::io::JoinPath(tempdir_name, ir_filename); + std::string ir_opt_filename = + absl::StrCat(module->getModuleIdentifier(), random_number + "_opt.ll"); + std::string ir_opt_path = + tensorflow::io::JoinPath(tempdir_name, ir_opt_filename); + std::string isabin_filename = - absl::StrCat(module->getModuleIdentifier(), ".o"); + absl::StrCat(module->getModuleIdentifier(), random_number + ".o"); std::string isabin_path = tensorflow::io::JoinPath(tempdir_name, isabin_filename); std::string hsaco_filename = - absl::StrCat(module->getModuleIdentifier(), ".hsaco"); + absl::StrCat(module->getModuleIdentifier(), random_number + ".hsaco"); std::string hsaco_path = tensorflow::io::JoinPath(tempdir_name, hsaco_filename); @@ -613,7 +687,7 @@ StatusOr> EmitModuleToHsaco( std::string module_id = module->getModuleIdentifier(); IrDumpingPassManager codegen_passes( ReplaceFilenameExtension(tensorflow::io::Basename(module_id), - "-amdgpu.dummy"), + random_number + "-amdgpu.dummy"), "", false); codegen_passes.add(new llvm::TargetLibraryInfoWrapperPass( llvm::Triple(module->getTargetTriple()))); @@ -627,6 +701,12 @@ StatusOr> EmitModuleToHsaco( codegen_passes.run(*module); isabin_fs->flush(); + if (keep_tempfiles) { + std::unique_ptr ir_fs( + new llvm::raw_fd_ostream(ir_opt_path, ec, llvm::sys::fs::F_None)); + module->print(*ir_fs, nullptr); + ir_fs->flush(); + } // Locate lld. // TODO(whchung@gmail.com): change to tensorflow::ROCmRoot() after // ROCm-Device-Libs PR. @@ -652,9 +732,9 @@ StatusOr> EmitModuleToHsaco( int lld_result = llvm::sys::ExecuteAndWait(*lld_program, llvm_ir::AsArrayRef(lld_args), llvm::None, {}, 0, 0, &error_message); - if (lld_result) { - return xla::InternalError("ld.lld execute fail: %s", error_message); + return xla::InternalError("ld.lld execute fail: %s, error code %d", + error_message, lld_result); } // Read HSACO. @@ -664,6 +744,12 @@ StatusOr> EmitModuleToHsaco( std::vector hsaco(hsaco_file_size); hsaco_file.seekg(0, std::ios::beg); hsaco_file.read(reinterpret_cast(&hsaco[0]), hsaco_file_size); + hsaco_file.close(); + if (!keep_tempfiles) { + remove(ir_path.c_str()); + remove(isabin_path.c_str()); + remove(hsaco_path.c_str()); + } return hsaco; } @@ -728,6 +814,20 @@ StatusOr> CompileToHsaco( std::vector hsaco; std::unique_ptr target_machine; + std::string str; + llvm::raw_string_ostream stream(str); + stream << *module; + // Delete the first two lines, since they usually vary even when the rest of + // the code is the same (but verify that they are what we expect). 
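Note: before hashing, the code below drops the `; ModuleID = ` and `source_filename = ` header lines, which differ between otherwise identical modules, and then appends the HLO module's compilation cache key. A compact sketch of just the normalization step (an illustrative helper, not part of the patch):

#include <string>

std::string NormalizeIrForCachingSketch(std::string ir) {
  // Strip the two leading header lines if present, in the order LLVM emits them.
  for (const char* prefix : {"; ModuleID = ", "source_filename = "}) {
    if (ir.rfind(prefix, 0) == 0) {  // line starts with the prefix
      auto pos = ir.find('\n');
      if (pos != std::string::npos) ir = ir.substr(pos + 1);
    }
  }
  // The real code then appends hlo_module_config.compilation_cache_key().
  return ir;
}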
+ if (str.size() >= 13 && str.substr(0, 13) == "; ModuleID = ") { + auto pos = str.find("\n"); + if (pos != std::string::npos) str = str.substr(pos + 1); + } + if (str.size() >= 18 && str.substr(0, 18) == "source_filename = ") { + auto pos = str.find("\n"); + if (pos != std::string::npos) str = str.substr(pos + 1); + } + str += hlo_module_config.compilation_cache_key(); { tensorflow::profiler::TraceMe activity( [&] { return absl::StrCat("Compiling IR", module->getName().str()); }, @@ -739,6 +839,21 @@ StatusOr> CompileToHsaco( return xla::InternalError( "Incompatible AMD GCN ISA version was specified."); } + uint64_t hash; + if (HsacoCache::Find(str, hash, *amdgpu_version, hsaco)) { + VLOG(1) << "HSACO cache hit"; + return hsaco; + } + VLOG(1) << "HSACO cache miss"; + bool dump_lls = false; + if (dump_lls) { + static int hsaco_count = 0; + std::string name = "/tmp/" + std::to_string(hsaco_count) + ".ll"; + hsaco_count++; + std::ofstream ofs(name); + ofs << str; + ofs.close(); + } llvm::Triple default_target_triple("amdgcn--amdhsa-amdgiz"); // Construct LLVM TargetMachine for AMDGPU. @@ -754,6 +869,7 @@ StatusOr> CompileToHsaco( // Lower optimized LLVM module to HSA code object. TF_ASSIGN_OR_RETURN(hsaco, EmitModuleToHsaco(module, target_machine.get())); + HsacoCache::Add(str, hash, *amdgpu_version, hsaco); } return hsaco; } diff --git a/tensorflow/compiler/xla/service/gpu/tests/BUILD b/tensorflow/compiler/xla/service/gpu/tests/BUILD index a2bddd2d0d7..809b277317f 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/gpu/tests/BUILD @@ -458,6 +458,35 @@ xla_test( ], ) +tf_cc_test( + name = "sorting_test", + srcs = [ + "sorting_test.cc", + ], + tags = tf_cuda_tests_tags() + [ + "no_rocm", + ], + deps = [ + ":gpu_codegen_test", + "//tensorflow/compiler/xla:debug_options_flags", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:xla_proto_cc", + "//tensorflow/compiler/xla/service:gpu_plugin", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_module_config", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/service/gpu:gpu_executable", + "//tensorflow/compiler/xla/tests:filecheck", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:llvm_irgen_test_base", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/memory", + ], +) + tf_cc_binary( name = "hlo_to_llvm_ir", srcs = ["hlo_to_llvm_ir.cc"], diff --git a/tensorflow/compiler/xla/service/gpu/tests/sorting.hlo b/tensorflow/compiler/xla/service/gpu/tests/sorting.hlo index 272c9a25769..4d29a8df116 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/sorting.hlo +++ b/tensorflow/compiler/xla/service/gpu/tests/sorting.hlo @@ -8,162 +8,162 @@ compare { ROOT lt = pred[] compare(p.0.lhs, p.0.rhs), direction=LT } -// CHECK: define void @sort(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]], i8* noalias align 16 dereferenceable(24) [[ALLOC1:%.*]]) +// CHECK: define void @sort(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]]) // CHECK-NEXT: entry: // CHECK-NEXT: [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1 -// CHECK-NEXT: [[SORT_RAW:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0]], i64 0 -// CHECK-NEXT: [[SORT_TYPED:%.*]] = bitcast i8* [[SORT_RAW]] to [2 x [3 x float]]* -// CHECK-NEXT: [[X_RAW:%.*]] = getelementptr inbounds i8, i8* [[ALLOC1]], i64 0 -// CHECK-NEXT: 
[[X_TYPED:%.*]] = bitcast i8* [[X_RAW]] to [2 x [3 x float]]* -// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 -// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP0]] to i64 -// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 -// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP1]] to i64 -// CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4 -// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP2]], [[THREAD_ID]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to [2 x [3 x float]]* +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0]], i64 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to [2 x [3 x float]]* +// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 +// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 +// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP5]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4 +// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP6]], [[THREAD_ID]] // CHECK-NEXT: [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 // CHECK-NEXT: call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]]) -// CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[LINEAR_INDEX]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = urem i64 [[TMP3]], 2 -// CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[LINEAR_INDEX]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 -// CHECK-NEXT: br i1 [[TMP6]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]] +// CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[LINEAR_INDEX]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = urem i64 [[TMP7]], 2 +// CHECK-NEXT: [[TMP9:%.*]] = udiv i64 [[LINEAR_INDEX]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 +// CHECK-NEXT: br i1 [[TMP10]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]] // CHECK: sort.in_bounds-after: // CHECK-NEXT: ret void // CHECK: sort.in_bounds-true: -// CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 2 -// CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 1 -// CHECK-NEXT: [[TMP9:%.*]] = icmp slt i64 [[TMP7]], [[TMP8]] -// CHECK-NEXT: [[TMP10:%.*]] = icmp slt i64 [[TMP8]], 3 -// CHECK-NEXT: [[TMP11:%.*]] = and i1 [[TMP9]], [[TMP10]] -// CHECK-NEXT: br i1 [[TMP11]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]] +// CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP8]], 2 +// CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 1 +// CHECK-NEXT: [[TMP13:%.*]] = icmp slt i64 [[TMP11]], [[TMP12]] +// CHECK-NEXT: [[TMP14:%.*]] = icmp slt i64 [[TMP12]], 3 +// CHECK-NEXT: [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]] +// CHECK-NEXT: br i1 [[TMP15]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]] // CHECK: smaller_comparison_index-after: // CHECK-NEXT: br label [[SORT_IN_BOUNDS_AFTER]] // CHECK: smaller_comparison_index-true: -// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED]], i64 0, i64 [[TMP5]], i64 [[TMP8]] -// CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED]], i64 0, i64 [[TMP5]], i64 [[TMP7]] -// CHECK-NEXT: call void @compare(float* [[TMP12]], float* [[TMP13]], i8* [[COMPARE_RETURN_BUFFER]]) -// CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* 
[[COMPARE_RETURN_BUFFER]], align 1 -// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP14]], 0 +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP12]] +// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP11]] +// CHECK-NEXT: call void @region_0_4(float* [[TMP16]], float* [[TMP17]], i8* [[COMPARE_RETURN_BUFFER]]) +// CHECK-NEXT: [[TMP18:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1 +// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP18]], 0 // CHECK-NEXT: br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]] // CHECK: is_smaller_than-after: // CHECK-NEXT: br label [[SMALLER_COMPARISON_INDEX_AFTER]] // CHECK: is_smaller_than-true: -// CHECK-NEXT: [[TMP15:%.*]] = load float, float* [[TMP12]], align 4 -// CHECK-NEXT: [[TMP16:%.*]] = load float, float* [[TMP13]], align 4 -// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED]], i64 0, i64 [[TMP5]], i64 [[TMP7]] -// CHECK-NEXT: store float [[TMP15]], float* [[TMP17]], align 4 -// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED]], i64 0, i64 [[TMP5]], i64 [[TMP8]] -// CHECK-NEXT: store float [[TMP16]], float* [[TMP18]], align 4 +// CHECK-NEXT: [[TMP19:%.*]] = load float, float* [[TMP16]], align 4 +// CHECK-NEXT: [[TMP20:%.*]] = load float, float* [[TMP17]], align 4 +// CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP11]] +// CHECK-NEXT: store float [[TMP19]], float* [[TMP21]], align 4 +// CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP12]] +// CHECK-NEXT: store float [[TMP20]], float* [[TMP22]], align 4 // CHECK-NEXT: br label [[IS_SMALLER_THAN_AFTER]] -// CHECK: define internal void @compare(float* dereferenceable(4) [[P_0_LHS_TYPED:%.*]], float* dereferenceable(4) [[P_0_RHS_TYPED:%.*]], i8* dereferenceable(1) [[OUTPUT_ARG:%.*]]) +// CHECK: define internal void @region_0_4(float* dereferenceable(4) [[P_0_LHS_TYPED:%.*]], float* dereferenceable(4) [[P_0_RHS_TYPED:%.*]], i8* dereferenceable(1) [[OUTPUT_ARG:%.*]]) // CHECK-NEXT: entry: -// CHECK-NEXT: [[LT_TYPED:%.*]] = alloca i8, align 1 -// CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[P_0_LHS_TYPED]], align 4 -// CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[P_0_RHS_TYPED]], align 4 +// CHECK-NEXT: [[COMPARE_3_TYPED:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARG_0_1_TYPED:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARG_1_2_TYPED:%.*]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = fcmp olt float [[TMP0]], [[TMP1]] // CHECK-NEXT: [[TMP3:%.*]] = zext i1 [[TMP2]] to i8 -// CHECK-NEXT: store i8 [[TMP3]], i8* [[LT_TYPED]], align 1 -// CHECK-NEXT: [[LOAD_RET_VALUE:%.*]] = load i8, i8* [[LT_TYPED]], align 1 -// CHECK-NEXT: store i8 [[LOAD_RET_VALUE]], i8* [[OUTPUT_ARG]], align 1 +// CHECK-NEXT: store i8 [[TMP3]], i8* [[COMPARE_3_TYPED]], align 1 +// CHECK-NEXT: [[LOAD_RET_VALUE:%.*]] = load i8, i8* [[COMPARE_3_TYPED]], align 1 +// CHECK-NEXT: store i8 [[LOAD_RET_VALUE]], i8* [[OUTPUT_ARG:%.*]], align 1 // CHECK-NEXT: ret void -// CHECK: define void @sort__1(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]], i8* noalias align 16 dereferenceable(24) 
[[ALLOC1:%.*]]) { +// CHECK: define void @sort__1(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]]) { // CHECK-NEXT: entry: // CHECK-NEXT: [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1 -// CHECK-NEXT: [[SORT_RAW:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0]], i64 0 -// CHECK-NEXT: [[SORT_TYPED:%.*]] = bitcast i8* [[SORT_RAW]] to [2 x [3 x float]]* -// CHECK-NEXT: [[X_RAW:%.*]] = getelementptr inbounds i8, i8* [[ALLOC1]], i64 0 -// CHECK-NEXT: [[X_TYPED:%.*]] = bitcast i8* [[X_RAW]] to [2 x [3 x float]]* -// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 -// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP0]] to i64 -// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 -// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP1]] to i64 -// CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4 -// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP2]], [[THREAD_ID]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to [2 x [3 x float]]* +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0]], i64 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to [2 x [3 x float]]* +// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 +// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 +// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP5]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4 +// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP6]], [[THREAD_ID]] // CHECK-NEXT: [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 // CHECK-NEXT: call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]]) -// CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[LINEAR_INDEX]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = urem i64 [[TMP3]], 2 -// CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[LINEAR_INDEX]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 -// CHECK-NEXT: br i1 [[TMP6]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]] +// CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[LINEAR_INDEX]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = urem i64 [[TMP7]], 2 +// CHECK-NEXT: [[TMP9:%.*]] = udiv i64 [[LINEAR_INDEX]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 +// CHECK-NEXT: br i1 [[TMP10]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]] // CHECK: sort.in_bounds-after: // CHECK-NEXT: ret void // CHECK: sort.in_bounds-true: -// CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP4]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = icmp slt i64 [[TMP4]], [[TMP7]] -// CHECK-NEXT: [[TMP9:%.*]] = icmp slt i64 [[TMP7]], 3 -// CHECK-NEXT: [[TMP10:%.*]] = and i1 [[TMP8]], [[TMP9]] -// CHECK-NEXT: br i1 [[TMP10]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]] +// CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP8]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = icmp slt i64 [[TMP8]], [[TMP11]] +// CHECK-NEXT: [[TMP13:%.*]] = icmp slt i64 [[TMP11]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = and i1 [[TMP12]], [[TMP13]] +// CHECK-NEXT: br i1 [[TMP14]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]] // CHECK: smaller_comparison_index-after: // CHECK-NEXT: br label [[SORT_IN_BOUNDS_AFTER]] // CHECK: smaller_comparison_index-true: -// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* 
[[SORT_TYPED]], i64 0, i64 [[TMP5]], i64 [[TMP7]] -// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED]], i64 0, i64 [[TMP5]], i64 [[TMP4]] -// CHECK-NEXT: call void @compare(float* [[TMP11]], float* [[TMP12]], i8* [[COMPARE_RETURN_BUFFER]]) -// CHECK-NEXT: [[TMP13:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1 -// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP13]], 0 +// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP11]] +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP8]] +// CHECK-NEXT: call void @region_0_4(float* [[TMP15]], float* [[TMP16]], i8* [[COMPARE_RETURN_BUFFER]]) +// CHECK-NEXT: [[TMP17:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1 +// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP17]], 0 // CHECK-NEXT: br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]] // CHECK: is_smaller_than-after: // CHECK-NEXT: br label [[SMALLER_COMPARISON_INDEX_AFTER]] // CHECK: is_smaller_than-true: -// CHECK-NEXT: [[TMP14:%.*]] = load float, float* [[TMP11]], align 4 -// CHECK-NEXT: [[TMP15:%.*]] = load float, float* [[TMP12]], align 4 -// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED]], i64 0, i64 [[TMP5]], i64 [[TMP4]] -// CHECK-NEXT: store float [[TMP14]], float* [[TMP16]], align 4 -// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED]], i64 0, i64 [[TMP5]], i64 [[TMP7]] -// CHECK-NEXT: store float [[TMP15]], float* [[TMP17]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = load float, float* [[TMP15]], align 4 +// CHECK-NEXT: [[TMP19:%.*]] = load float, float* [[TMP16]], align 4 +// CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP8]] +// CHECK-NEXT: store float [[TMP18]], float* [[TMP20]], align 4 +// CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP11]] +// CHECK-NEXT: store float [[TMP19]], float* [[TMP21]], align 4 // CHECK-NEXT: br label [[IS_SMALLER_THAN_AFTER]] -// CHECK: define void @sort__2(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]], i8* noalias align 16 dereferenceable(24) [[ALLOC1:%.*]]) { +// CHECK: define void @sort__2(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]]) { // CHECK-NEXT: entry: // CHECK-NEXT: [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1 -// CHECK-NEXT: [[SORT_RAW:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0 -// CHECK-NEXT: [[SORT_TYPED:%.*]] = bitcast i8* [[SORT_RAW]] to [2 x [3 x float]]* -// CHECK-NEXT: [[X_RAW:%.*]] = getelementptr inbounds i8, i8* [[ALLOC1:%.*]], i64 0 -// CHECK-NEXT: [[X_TYPED:%.*]] = bitcast i8* [[X_RAW]] to [2 x [3 x float]]* -// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 -// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP0]] to i64 -// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 -// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP1]] to i64 -// CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4 -// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP2]], [[THREAD_ID]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0 +// 
CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to [2 x [3 x float]]* +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0]], i64 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to [2 x [3 x float]]* +// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 +// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 +// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP5]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4 +// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP6]], [[THREAD_ID]] // CHECK-NEXT: [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 // CHECK-NEXT: call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]]) -// CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[LINEAR_INDEX]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = urem i64 [[TMP3]], 2 -// CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[LINEAR_INDEX]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 -// CHECK-NEXT: br i1 [[TMP6]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]] +// CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[LINEAR_INDEX]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = urem i64 [[TMP7]], 2 +// CHECK-NEXT: [[TMP9:%.*]] = udiv i64 [[LINEAR_INDEX]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 +// CHECK-NEXT: br i1 [[TMP10]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]] // CHECK: sort.in_bounds-after: // CHECK-NEXT: ret void // CHECK: sort.in_bounds-true: -// CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 2 -// CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 1 -// CHECK-NEXT: [[TMP9:%.*]] = icmp slt i64 [[TMP7]], [[TMP8]] -// CHECK-NEXT: [[TMP10:%.*]] = icmp slt i64 [[TMP8]], 3 -// CHECK-NEXT: [[TMP11:%.*]] = and i1 [[TMP9]], [[TMP10]] -// CHECK-NEXT: br i1 [[TMP11]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]] +// CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP8]], 2 +// CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 1 +// CHECK-NEXT: [[TMP13:%.*]] = icmp slt i64 [[TMP11]], [[TMP12]] +// CHECK-NEXT: [[TMP14:%.*]] = icmp slt i64 [[TMP12]], 3 +// CHECK-NEXT: [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]] +// CHECK-NEXT: br i1 [[TMP15]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]] // CHECK: smaller_comparison_index-after: // CHECK-NEXT: br label [[SORT_IN_BOUNDS_AFTER]] // CHECK: smaller_comparison_index-true: -// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED]], i64 0, i64 [[TMP5]], i64 [[TMP8]] -// CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED]], i64 0, i64 [[TMP5]], i64 [[TMP7]] -// CHECK-NEXT: call void @compare(float* [[TMP12]], float* [[TMP13]], i8* [[COMPARE_RETURN_BUFFER]]) -// CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1 -// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP14]], 0 +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP12]] +// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP11]] +// CHECK-NEXT: call void @region_0_4(float* [[TMP16]], float* [[TMP17]], i8* [[COMPARE_RETURN_BUFFER]]) +// CHECK-NEXT: [[TMP18:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1 +// CHECK-NEXT: 
[[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP18]], 0 // CHECK-NEXT: br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]] // CHECK: is_smaller_than-after: // CHECK-NEXT: br label [[SMALLER_COMPARISON_INDEX_AFTER]] // CHECK: is_smaller_than-true: -// CHECK-NEXT: [[TMP15:%.*]] = load float, float* [[TMP12]], align 4 -// CHECK-NEXT: [[TMP16:%.*]] = load float, float* [[TMP13]], align 4 -// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED]], i64 0, i64 [[TMP5]], i64 [[TMP7]] -// CHECK-NEXT: store float [[TMP15]], float* [[TMP17]], align 4 -// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED]], i64 0, i64 [[TMP5]], i64 [[TMP8]] -// CHECK-NEXT: store float [[TMP16]], float* [[TMP18]], align 4 +// CHECK-NEXT: [[TMP19:%.*]] = load float, float* [[TMP16]], align 4 +// CHECK-NEXT: [[TMP20:%.*]] = load float, float* [[TMP17]], align 4 +// CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP11]] +// CHECK-NEXT: store float [[TMP19]], float* [[TMP21]], align 4 +// CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP1]], i64 0, i64 [[TMP9]], i64 [[TMP12]] +// CHECK-NEXT: store float [[TMP20]], float* [[TMP22]], align 4 // CHECK-NEXT: br label [[IS_SMALLER_THAN_AFTER]] ENTRY main { x = f32[2, 3] parameter(0) @@ -182,210 +182,198 @@ compare { ROOT lt = pred[] compare(p.1.lhs, p.1.rhs), direction=LT } -// CHECK: define void @sort(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]], i8* noalias align 64 dereferenceable(24) [[ALLOC1:%.*]], i8* noalias align 16 dereferenceable(24) [[ALLOC2:%.*]], i8* noalias align 16 dereferenceable(24) [[ALLOC3:%.*]], i8* noalias align 64 dereferenceable(16) [[ALLOC4:%.*]]) +// CHECK: define void @sort(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]], i8* noalias align 64 dereferenceable(24) [[ALLOC1:%.*]], i8* noalias align 64 dereferenceable(16) [[ALLOC4:%.*]]) // CHECK-NEXT: entry: // CHECK-NEXT: [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1 -// CHECK-NEXT: [[SORT_RAW:%.*]] = getelementptr inbounds i8, i8* [[ALLOC4]], i64 0 -// CHECK-NEXT: [[SORT_TYPED:%.*]] = bitcast i8* [[SORT_RAW]] to [2 x i8*]* -// CHECK-NEXT: [[SORT_RAW1:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0]], i64 0 -// CHECK-NEXT: [[SORT_TYPED2:%.*]] = bitcast i8* [[SORT_RAW1]] to [2 x [3 x i32]]* -// CHECK-NEXT: [[SORT_RAW3:%.*]] = getelementptr inbounds i8, i8* [[ALLOC1]], i64 0 -// CHECK-NEXT: [[SORT_TYPED4:%.*]] = bitcast i8* [[SORT_RAW3]] to [2 x [3 x float]]* -// CHECK-NEXT: [[X_RAW:%.*]] = getelementptr inbounds i8, i8* [[ALLOC2]], i64 0 -// CHECK-NEXT: [[X_TYPED:%.*]] = bitcast i8* [[X_RAW]] to [2 x [3 x i32]]* -// CHECK-NEXT: [[Y_RAW:%.*]] = getelementptr inbounds i8, i8* [[ALLOC3]], i64 0 -// CHECK-NEXT: [[Y_TYPED:%.*]] = bitcast i8* [[Y_RAW]] to [2 x [3 x float]]* -// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 -// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP0]] to i64 -// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 -// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP1]] to i64 -// CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4 -// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP2]], [[THREAD_ID]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* 
[[TMP0]] to [2 x [3 x i32]]* +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[ALLOC1:%.*]], i64 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to [2 x [3 x float]]* +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[ALLOC4:%.*]], i64 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 +// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 +// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4 +// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP8]], [[THREAD_ID]] // CHECK-NEXT: [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 // CHECK-NEXT: call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]]) -// CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[LINEAR_INDEX]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = urem i64 [[TMP3]], 2 -// CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[LINEAR_INDEX]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 -// CHECK-NEXT: br i1 [[TMP6]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]] +// CHECK-NEXT: [[TMP9:%.*]] = udiv i64 [[LINEAR_INDEX]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = urem i64 [[TMP9]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = udiv i64 [[LINEAR_INDEX]], 2 +// CHECK-NEXT: [[TMP12:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 +// CHECK-NEXT: br i1 [[TMP12]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]] // CHECK: sort.in_bounds-after: // CHECK-NEXT: ret void // CHECK: sort.in_bounds-true: -// CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 2 -// CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 1 -// CHECK-NEXT: [[TMP9:%.*]] = icmp slt i64 [[TMP7]], [[TMP8]] -// CHECK-NEXT: [[TMP10:%.*]] = icmp slt i64 [[TMP8]], 3 -// CHECK-NEXT: [[TMP11:%.*]] = and i1 [[TMP9]], [[TMP10]] -// CHECK-NEXT: br i1 [[TMP11]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]] +// CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP10]], 2 +// CHECK-NEXT: [[TMP14:%.*]] = xor i64 [[TMP13]], 1 +// CHECK-NEXT: [[TMP15:%.*]] = icmp slt i64 [[TMP13]], [[TMP14]] +// CHECK-NEXT: [[TMP16:%.*]] = icmp slt i64 [[TMP14]], 3 +// CHECK-NEXT: [[TMP17:%.*]] = and i1 [[TMP15]], [[TMP16]] +// CHECK-NEXT: br i1 [[TMP17]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]] // CHECK: smaller_comparison_index-after: // CHECK-NEXT: br label [[SORT_IN_BOUNDS_AFTER]] // CHECK: smaller_comparison_index-true: -// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[SORT_TYPED2]], i64 0, i64 [[TMP5]], i64 [[TMP8]] -// CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[SORT_TYPED2]], i64 0, i64 [[TMP5]], i64 [[TMP7]] -// CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED4]], i64 0, i64 [[TMP5]], i64 [[TMP8]] -// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED4]], i64 0, i64 [[TMP5]], i64 [[TMP7]] -// CHECK-NEXT: call void @compare(i32* [[TMP12]], i32* [[TMP13]], float* [[TMP14]], float* [[TMP15]], i8* [[COMPARE_RETURN_BUFFER]]) -// CHECK-NEXT: [[TMP16:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1 -// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP16]], 0 +// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [2 x [3 x 
i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP14]] +// CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP13]] +// CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP14]] +// CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP13]] +// CHECK-NEXT: call void @region_0_6(i32* [[TMP18]], i32* [[TMP19]], float* [[TMP20]], float* [[TMP21]], i8* [[COMPARE_RETURN_BUFFER]]) +// CHECK-NEXT: [[TMP22:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1 +// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP22]], 0 // CHECK-NEXT: br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]] // CHECK: is_smaller_than-after: // CHECK-NEXT: br label [[SMALLER_COMPARISON_INDEX_AFTER]] // CHECK: is_smaller_than-true: -// CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP12]], align 4 -// CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP13]], align 4 -// CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[SORT_TYPED2]], i64 0, i64 [[TMP5]], i64 [[TMP7]] -// CHECK-NEXT: store i32 [[TMP17]], i32* [[TMP19]], align 4 -// CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[SORT_TYPED2]], i64 0, i64 [[TMP5]], i64 [[TMP8]] -// CHECK-NEXT: store i32 [[TMP18]], i32* [[TMP20]], align 4 -// CHECK-NEXT: [[TMP21:%.*]] = load float, float* [[TMP14]], align 4 -// CHECK-NEXT: [[TMP22:%.*]] = load float, float* [[TMP15]], align 4 -// CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED4]], i64 0, i64 [[TMP5]], i64 [[TMP7]] -// CHECK-NEXT: store float [[TMP21]], float* [[TMP23]], align 4 -// CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED4]], i64 0, i64 [[TMP5]], i64 [[TMP8]] -// CHECK-NEXT: store float [[TMP22]], float* [[TMP24]], align 4 +// CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP18]], align 4 +// CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP19]], align 4 +// CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP13]] +// CHECK-NEXT: store i32 [[TMP23]], i32* [[TMP25]], align 4 +// CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP14]] +// CHECK-NEXT: store i32 [[TMP24]], i32* [[TMP26]], align 4 +// CHECK-NEXT: [[TMP27:%.*]] = load float, float* [[TMP20]], align 4 +// CHECK-NEXT: [[TMP28:%.*]] = load float, float* [[TMP21]], align 4 +// CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP13]] +// CHECK-NEXT: store float [[TMP27]], float* [[TMP29]], align 4 +// CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP14]] +// CHECK-NEXT: store float [[TMP28]], float* [[TMP30]], align 4 // CHECK-NEXT: br label [[IS_SMALLER_THAN_AFTER]] -// CHECK: define internal void @compare(i32* dereferenceable(4) [[P_0_LHS_TYPED:%.*]], i32* dereferenceable(4) [[P_0_RHS_TYPED:%.*]], float* dereferenceable(4) [[P_1_LHS_TYPED:%.*]], float* dereferenceable(4) [[P_1_RHS_TYPED:%.*]], i8* dereferenceable(1) [[OUTPUT_ARG:%.*]]) +// CHECK: define internal void @region_0_6(i32* 
dereferenceable(4) [[P_0_LHS_TYPED:%.*]], i32* dereferenceable(4) [[P_0_RHS_TYPED:%.*]], float* dereferenceable(4) [[P_1_LHS_TYPED:%.*]], float* dereferenceable(4) [[P_1_RHS_TYPED:%.*]], i8* dereferenceable(1) [[OUTPUT_ARG:%.*]]) // CHECK-NEXT: entry: -// CHECK-NEXT: [[LT_TYPED:%.*]] = alloca i8, align 1 -// CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[P_1_LHS_TYPED]], align 4 -// CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[P_1_RHS_TYPED]], align 4 +// CHECK-NEXT: [[COMPARE_5_TYPED:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARG_2_3_TYPED:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARG_3_4_TYPED:%.*]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = fcmp olt float [[TMP0]], [[TMP1]] // CHECK-NEXT: [[TMP3:%.*]] = zext i1 [[TMP2]] to i8 -// CHECK-NEXT: store i8 [[TMP3]], i8* [[LT_TYPED]], align 1 -// CHECK-NEXT: [[LOAD_RET_VALUE:%.*]] = load i8, i8* [[LT_TYPED]], align 1 -// CHECK-NEXT: store i8 [[LOAD_RET_VALUE]], i8* [[OUTPUT_ARG]], align 1 +// CHECK-NEXT: store i8 [[TMP3]], i8* [[COMPARE_5_TYPED]], align 1 +// CHECK-NEXT: [[LOAD_RET_VALUE:%.*]] = load i8, i8* [[COMPARE_5_TYPED]], align 1 +// CHECK-NEXT: store i8 [[LOAD_RET_VALUE]], i8* [[OUTPUT_ARG:%.*]], align 1 // CHECK-NEXT: ret void -// CHECK: define void @sort__1(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]], i8* noalias align 64 dereferenceable(24) [[ALLOC1:%.*]], i8* noalias align 16 dereferenceable(24) [[ALLOC2:%.*]], i8* noalias align 16 dereferenceable(24) [[ALLOC3:%.*]], i8* noalias align 64 dereferenceable(16) [[ALLOC4:%.*]]) +// CHECK: define void @sort__1(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]], i8* noalias align 64 dereferenceable(24) [[ALLOC1:%.*]], i8* noalias align 64 dereferenceable(16) [[ALLOC4:%.*]]) // CHECK-NEXT: entry: // CHECK-NEXT: [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1 -// CHECK-NEXT: [[SORT_RAW:%.*]] = getelementptr inbounds i8, i8* [[ALLOC4:%.*]], i64 0 -// CHECK-NEXT: [[SORT_TYPED:%.*]] = bitcast i8* [[SORT_RAW]] to [2 x i8*]* -// CHECK-NEXT: [[SORT_RAW1:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0 -// CHECK-NEXT: [[SORT_TYPED2:%.*]] = bitcast i8* [[SORT_RAW1]] to [2 x [3 x i32]]* -// CHECK-NEXT: [[SORT_RAW3:%.*]] = getelementptr inbounds i8, i8* [[ALLOC1:%.*]], i64 0 -// CHECK-NEXT: [[SORT_TYPED4:%.*]] = bitcast i8* [[SORT_RAW3]] to [2 x [3 x float]]* -// CHECK-NEXT: [[X_RAW:%.*]] = getelementptr inbounds i8, i8* [[ALLOC2:%.*]], i64 0 -// CHECK-NEXT: [[X_TYPED:%.*]] = bitcast i8* [[X_RAW]] to [2 x [3 x i32]]* -// CHECK-NEXT: [[Y_RAW:%.*]] = getelementptr inbounds i8, i8* [[ALLOC3:%.*]], i64 0 -// CHECK-NEXT: [[Y_TYPED:%.*]] = bitcast i8* [[Y_RAW]] to [2 x [3 x float]]* -// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 -// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP0]] to i64 -// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 -// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP1]] to i64 -// CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4 -// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP2]], [[THREAD_ID]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to [2 x [3 x i32]]* +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[ALLOC1:%.*]], i64 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to [2 x [3 x float]]* +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[ALLOC4:%.*]], i64 0 +// CHECK-NEXT: 
[[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 +// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 +// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4 +// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP8]], [[THREAD_ID]] // CHECK-NEXT: [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 // CHECK-NEXT: call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]]) -// CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[LINEAR_INDEX]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = urem i64 [[TMP3]], 2 -// CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[LINEAR_INDEX]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 -// CHECK-NEXT: br i1 [[TMP6]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]] +// CHECK-NEXT: [[TMP9:%.*]] = udiv i64 [[LINEAR_INDEX]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = urem i64 [[TMP9]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = udiv i64 [[LINEAR_INDEX]], 2 +// CHECK-NEXT: [[TMP12:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 +// CHECK-NEXT: br i1 [[TMP12]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]] // CHECK: sort.in_bounds-after: // CHECK-NEXT: ret void // CHECK: sort.in_bounds-true: -// CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP4]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = icmp slt i64 [[TMP4]], [[TMP7]] -// CHECK-NEXT: [[TMP9:%.*]] = icmp slt i64 [[TMP7]], 3 -// CHECK-NEXT: [[TMP10:%.*]] = and i1 [[TMP8]], [[TMP9]] -// CHECK-NEXT: br i1 [[TMP10]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]] +// CHECK-NEXT: [[TMP13:%.*]] = xor i64 [[TMP10]], 3 +// CHECK-NEXT: [[TMP14:%.*]] = icmp slt i64 [[TMP10]], [[TMP13]] +// CHECK-NEXT: [[TMP15:%.*]] = icmp slt i64 [[TMP13]], 3 +// CHECK-NEXT: [[TMP16:%.*]] = and i1 [[TMP14]], [[TMP15]] +// CHECK-NEXT: br i1 [[TMP16]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]] // CHECK: smaller_comparison_index-after: // CHECK-NEXT: br label [[SORT_IN_BOUNDS_AFTER]] // CHECK: smaller_comparison_index-true: -// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[SORT_TYPED2]], i64 0, i64 [[TMP5]], i64 [[TMP7]] -// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[SORT_TYPED2]], i64 0, i64 [[TMP5]], i64 [[TMP4]] -// CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED4]], i64 0, i64 [[TMP5]], i64 [[TMP7]] -// CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED4]], i64 0, i64 [[TMP5]], i64 [[TMP4]] -// CHECK-NEXT: call void @compare(i32* [[TMP11]], i32* [[TMP12]], float* [[TMP13]], float* [[TMP14]], i8* [[COMPARE_RETURN_BUFFER]]) -// CHECK-NEXT: [[TMP15:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1 -// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP15]], 0 +// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP13]] +// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP10]] +// CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP13]] +// CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds 
[2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP10]] +// CHECK-NEXT: call void @region_0_6(i32* [[TMP17]], i32* [[TMP18]], float* [[TMP19]], float* [[TMP20]], i8* [[COMPARE_RETURN_BUFFER]]) +// CHECK-NEXT: [[TMP21:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1 +// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP21]], 0 // CHECK-NEXT: br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]] // CHECK: is_smaller_than-after: // CHECK-NEXT: br label [[SMALLER_COMPARISON_INDEX_AFTER]] // CHECK: is_smaller_than-true: -// CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP11]], align 4 -// CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP12]], align 4 -// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[SORT_TYPED2]], i64 0, i64 [[TMP5]], i64 [[TMP4]] -// CHECK-NEXT: store i32 [[TMP16]], i32* [[TMP18]], align 4 -// CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[SORT_TYPED2]], i64 0, i64 [[TMP5]], i64 [[TMP7]] -// CHECK-NEXT: store i32 [[TMP17]], i32* [[TMP19]], align 4 -// CHECK-NEXT: [[TMP20:%.*]] = load float, float* [[TMP13]], align 4 -// CHECK-NEXT: [[TMP21:%.*]] = load float, float* [[TMP14]], align 4 -// CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED4]], i64 0, i64 [[TMP5]], i64 [[TMP4]] -// CHECK-NEXT: store float [[TMP20]], float* [[TMP22]], align 4 -// CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED4]], i64 0, i64 [[TMP5]], i64 [[TMP7]] -// CHECK-NEXT: store float [[TMP21]], float* [[TMP23]], align 4 +// CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP17]], align 4 +// CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP18]], align 4 +// CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP10]] +// CHECK-NEXT: store i32 [[TMP22]], i32* [[TMP24]], align 4 +// CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP13]] +// CHECK-NEXT: store i32 [[TMP23]], i32* [[TMP25]], align 4 +// CHECK-NEXT: [[TMP26:%.*]] = load float, float* [[TMP19]], align 4 +// CHECK-NEXT: [[TMP27:%.*]] = load float, float* [[TMP20]], align 4 +// CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP10]] +// CHECK-NEXT: store float [[TMP26]], float* [[TMP28]], align 4 +// CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP13]] +// CHECK-NEXT: store float [[TMP27]], float* [[TMP29]], align 4 // CHECK-NEXT: br label [[IS_SMALLER_THAN_AFTER]] -// CHECK: define void @sort__2(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]], i8* noalias align 64 dereferenceable(24) [[ALLOC1:%.*]], i8* noalias align 16 dereferenceable(24) [[ALLOC2:%.*]], i8* noalias align 16 dereferenceable(24) [[ALLOC3:%.*]], i8* noalias align 64 dereferenceable(16) [[ALLOC4:%.*]]) +// CHECK: define void @sort__2(i8* noalias align 64 dereferenceable(24) [[ALLOC0:%.*]], i8* noalias align 64 dereferenceable(24) [[ALLOC1:%.*]], i8* noalias align 64 dereferenceable(16) [[ALLOC4:%.*]]) // CHECK-NEXT: entry: // CHECK-NEXT: [[COMPARE_RETURN_BUFFER:%.*]] = alloca i8, align 1 -// CHECK-NEXT: [[SORT_RAW:%.*]] = getelementptr inbounds i8, i8* [[ALLOC4:%.*]], i64 0 -// CHECK-NEXT: [[SORT_TYPED:%.*]] 
= bitcast i8* [[SORT_RAW]] to [2 x i8*]* -// CHECK-NEXT: [[SORT_RAW1:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0 -// CHECK-NEXT: [[SORT_TYPED2:%.*]] = bitcast i8* [[SORT_RAW1]] to [2 x [3 x i32]]* -// CHECK-NEXT: [[SORT_RAW3:%.*]] = getelementptr inbounds i8, i8* [[ALLOC1:%.*]], i64 0 -// CHECK-NEXT: [[SORT_TYPED4:%.*]] = bitcast i8* [[SORT_RAW3]] to [2 x [3 x float]]* -// CHECK-NEXT: [[X_RAW:%.*]] = getelementptr inbounds i8, i8* [[ALLOC2:%.*]], i64 0 -// CHECK-NEXT: [[X_TYPED:%.*]] = bitcast i8* [[X_RAW]] to [2 x [3 x i32]]* -// CHECK-NEXT: [[Y_RAW:%.*]] = getelementptr inbounds i8, i8* [[ALLOC3:%.*]], i64 0 -// CHECK-NEXT: [[Y_TYPED:%.*]] = bitcast i8* [[Y_RAW]] to [2 x [3 x float]]* -// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 -// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP0]] to i64 -// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 -// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP1]] to i64 -// CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4 -// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP2]], [[THREAD_ID]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[ALLOC0:%.*]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to [2 x [3 x i32]]* +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[ALLOC1:%.*]], i64 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to [2 x [3 x float]]* +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[ALLOC4:%.*]], i64 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !6 +// CHECK-NEXT: [[BLOCK_ID:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !7 +// CHECK-NEXT: [[THREAD_ID:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = mul nuw nsw i64 [[BLOCK_ID]], 4 +// CHECK-NEXT: [[LINEAR_INDEX:%.*]] = add nuw nsw i64 [[TMP8]], [[THREAD_ID]] // CHECK-NEXT: [[LINEAR_INDEX_IN_RANGE:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 // CHECK-NEXT: call void @llvm.assume(i1 [[LINEAR_INDEX_IN_RANGE]]) -// CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[LINEAR_INDEX]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = urem i64 [[TMP3]], 2 -// CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[LINEAR_INDEX]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 -// CHECK-NEXT: br i1 [[TMP6]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]] +// CHECK-NEXT: [[TMP9:%.*]] = udiv i64 [[LINEAR_INDEX]], 1 +// CHECK-NEXT: [[TMP10:%.*]] = urem i64 [[TMP9]], 2 +// CHECK-NEXT: [[TMP11:%.*]] = udiv i64 [[LINEAR_INDEX]], 2 +// CHECK-NEXT: [[TMP12:%.*]] = icmp ult i64 [[LINEAR_INDEX]], 4 +// CHECK-NEXT: br i1 [[TMP12]], label [[SORT_IN_BOUNDS_TRUE:%.*]], label [[SORT_IN_BOUNDS_AFTER:%.*]] // CHECK: sort.in_bounds-after: -// CHECK-NEXT: [[TMP7:%.*]] = bitcast [2 x [3 x i32]]* [[SORT_TYPED2]] to i8* -// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[SORT_TYPED]], i64 0, i64 0 -// CHECK-NEXT: store i8* [[TMP7]], i8** [[TMP8]], align 8 -// CHECK-NEXT: [[TMP9:%.*]] = bitcast [2 x [3 x float]]* [[SORT_TYPED4]] to i8* -// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[SORT_TYPED]], i64 0, i64 1 -// CHECK-NEXT: store i8* [[TMP9]], i8** [[TMP10]], align 8 +// CHECK-NEXT: [[TMP13:%.*]] = bitcast [2 x [3 x i32]]* [[TMP1]] to i8* +// CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], 
i64 0, i64 0 +// CHECK-NEXT: store i8* [[TMP13]], i8** [[TMP14]], align 8 +// CHECK-NEXT: [[TMP15:%.*]] = bitcast [2 x [3 x float]]* [[TMP3]] to i8* +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1 +// CHECK-NEXT: store i8* [[TMP15]], i8** [[TMP16]], align 8 // CHECK-NEXT: ret void // CHECK: sort.in_bounds-true: -// CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP4]], 2 -// CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 1 -// CHECK-NEXT: [[TMP13:%.*]] = icmp slt i64 [[TMP11]], [[TMP12]] -// CHECK-NEXT: [[TMP14:%.*]] = icmp slt i64 [[TMP12]], 3 -// CHECK-NEXT: [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]] -// CHECK-NEXT: br i1 [[TMP15]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]] +// CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP10]], 2 +// CHECK-NEXT: [[TMP18:%.*]] = xor i64 [[TMP17]], 1 +// CHECK-NEXT: [[TMP19:%.*]] = icmp slt i64 [[TMP17]], [[TMP18]] +// CHECK-NEXT: [[TMP20:%.*]] = icmp slt i64 [[TMP18]], 3 +// CHECK-NEXT: [[TMP21:%.*]] = and i1 [[TMP19]], [[TMP20]] +// CHECK-NEXT: br i1 [[TMP21]], label [[SMALLER_COMPARISON_INDEX_TRUE:%.*]], label [[SMALLER_COMPARISON_INDEX_AFTER:%.*]] // CHECK: smaller_comparison_index-after: // CHECK-NEXT: br label [[SORT_IN_BOUNDS_AFTER]] // CHECK: smaller_comparison_index-true: -// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[SORT_TYPED2]], i64 0, i64 [[TMP5]], i64 [[TMP12]] -// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[SORT_TYPED2]], i64 0, i64 [[TMP5]], i64 [[TMP11]] -// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED4]], i64 0, i64 [[TMP5]], i64 [[TMP12]] -// CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED4]], i64 0, i64 [[TMP5]], i64 [[TMP11]] -// CHECK-NEXT: call void @compare(i32* [[TMP16]], i32* [[TMP17]], float* [[TMP18]], float* [[TMP19]], i8* [[COMPARE_RETURN_BUFFER]]) -// CHECK-NEXT: [[TMP20:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1 -// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP20]], 0 +// CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP18]] +// CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP17]] +// CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP18]] +// CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP17]] +// CHECK-NEXT: call void @region_0_6(i32* [[TMP22]], i32* [[TMP23]], float* [[TMP24]], float* [[TMP25]], i8* [[COMPARE_RETURN_BUFFER]]) +// CHECK-NEXT: [[TMP26:%.*]] = load i8, i8* [[COMPARE_RETURN_BUFFER]], align 1 +// CHECK-NEXT: [[BOOLEAN_PREDICATE:%.*]] = icmp ne i8 [[TMP26]], 0 // CHECK-NEXT: br i1 [[BOOLEAN_PREDICATE]], label [[IS_SMALLER_THAN_TRUE:%.*]], label [[IS_SMALLER_THAN_AFTER:%.*]] // CHECK: is_smaller_than-after: // CHECK-NEXT: br label [[SMALLER_COMPARISON_INDEX_AFTER]] // CHECK: is_smaller_than-true: -// CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[TMP16]], align 4 -// CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP17]], align 4 -// CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[SORT_TYPED2]], i64 0, i64 [[TMP5]], i64 [[TMP11]] -// CHECK-NEXT: store i32 
[[TMP21]], i32* [[TMP23]], align 4 -// CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[SORT_TYPED2]], i64 0, i64 [[TMP5]], i64 [[TMP12]] -// CHECK-NEXT: store i32 [[TMP22]], i32* [[TMP24]], align 4 -// CHECK-NEXT: [[TMP25:%.*]] = load float, float* [[TMP18]], align 4 -// CHECK-NEXT: [[TMP26:%.*]] = load float, float* [[TMP19]], align 4 -// CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED4]], i64 0, i64 [[TMP5]], i64 [[TMP11]] -// CHECK-NEXT: store float [[TMP25]], float* [[TMP27]], align 4 -// CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[SORT_TYPED4]], i64 0, i64 [[TMP5]], i64 [[TMP12]] -// CHECK-NEXT: store float [[TMP26]], float* [[TMP28]], align 4 +// CHECK-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP22]], align 4 +// CHECK-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP23]], align 4 +// CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP17]] +// CHECK-NEXT: store i32 [[TMP27]], i32* [[TMP29]], align 4 +// CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [2 x [3 x i32]], [2 x [3 x i32]]* [[TMP1]], i64 0, i64 [[TMP11]], i64 [[TMP18]] +// CHECK-NEXT: store i32 [[TMP28]], i32* [[TMP30]], align 4 +// CHECK-NEXT: [[TMP31:%.*]] = load float, float* [[TMP24]], align 4 +// CHECK-NEXT: [[TMP32:%.*]] = load float, float* [[TMP25]], align 4 +// CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP17]] +// CHECK-NEXT: store float [[TMP31]], float* [[TMP33]], align 4 +// CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds [2 x [3 x float]], [2 x [3 x float]]* [[TMP3]], i64 0, i64 [[TMP11]], i64 [[TMP18]] +// CHECK-NEXT: store float [[TMP32]], float* [[TMP34]], align 4 // CHECK-NEXT: br label [[IS_SMALLER_THAN_AFTER]] ENTRY main { x = s32[2, 3] parameter(0) diff --git a/tensorflow/compiler/xla/service/gpu/tests/sorting_test.cc b/tensorflow/compiler/xla/service/gpu/tests/sorting_test.cc new file mode 100644 index 00000000000..197a0c6cfeb --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/tests/sorting_test.cc @@ -0,0 +1,71 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include + +#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" +#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module_config.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/tests/filecheck.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/xla.pb.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace xla { +namespace gpu { + +namespace { + +class SortingTest : public GpuCodegenTest { + protected: + HloModuleConfig ConfigWithoutLayoutAssignment() { + HloModuleConfig config; + auto debug_options = HloTestBase::GetDebugOptionsForTest(); + // Disable layout_assignment to use the preassigned layouts. + debug_options.add_xla_disable_hlo_passes("layout-assignment"); + config.set_debug_options(debug_options); + return config; + } +}; + +TEST_F(SortingTest, Regression1) { + const char* hlo_text = R"( +HloModule TestModule + +compare { + p.0.lhs = f32[] parameter(0) + p.0.rhs = f32[] parameter(1) + ROOT lt = pred[] compare(p.0.lhs, p.0.rhs), direction=LT +} + +ENTRY TestComputation { + x = f32[3, 2]{1, 0} parameter(0) + x.copy = f32[3, 2]{0, 1} copy(x) + ROOT sort = f32[3, 2]{0, 1} sort(x.copy), dimensions={1}, to_apply=compare +} + +)"; + + EXPECT_TRUE(RunAndCompareNoHloPasses(hlo_text, ErrorSpec{1e-5, 1e-5})); +} + +} // namespace +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/heap_simulator.cc b/tensorflow/compiler/xla/service/heap_simulator.cc index 10751752571..2e2b668eba7 100644 --- a/tensorflow/compiler/xla/service/heap_simulator.cc +++ b/tensorflow/compiler/xla/service/heap_simulator.cc @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/service/hlo_live_range.h" #include "tensorflow/compiler/xla/service/hlo_schedule.h" +#include "tensorflow/compiler/xla/service/memory_space_assignment_repacking.h" #include "tensorflow/compiler/xla/util.h" namespace xla { @@ -55,9 +56,10 @@ StatusOr HeapSimulator::MinimumMemoryForModule( // rather than summing each computation, since it gives us a better lower // bound, by minimizing the liveness of sub-computations. 
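The heap_simulator.cc changes below parameterize HeapSimulator and its heap algorithms on a BufferType. Expanded for readability, the updated call in MinimumMemoryForModule presumably takes the following form (a sketch, assuming the buffer type here is HloValue):

    TF_ASSIGN_OR_RETURN(
        HeapSimulator::Result<HloValue> result,
        HeapSimulator::Run(
            absl::make_unique<NoFragmentationStatsHeap<HloValue>>(), *module,
            schedule, *alias_analysis, size_function));
    return result.heap_size;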
TF_ASSIGN_OR_RETURN( - HeapSimulator::Result result, - HeapSimulator::Run(absl::make_unique(), *module, - schedule, *alias_analysis, size_function)); + HeapSimulator::Result result, + HeapSimulator::Run( + absl::make_unique>(), *module, + schedule, *alias_analysis, size_function)); return result.heap_size; } @@ -69,10 +71,11 @@ StatusOr HeapSimulator::MinimumMemoryForComputation( const absl::flat_hash_map* memory_by_computation) { TF_ASSIGN_OR_RETURN( - HeapSimulator::Result result, - HeapSimulator::Run(absl::make_unique(), - computation, sequence, alias_analysis, size_function, - HeapSimulator::Options(), memory_by_computation)); + HeapSimulator::Result result, + HeapSimulator::Run( + absl::make_unique>(), computation, + sequence, alias_analysis, size_function, HeapSimulator::Options(), + memory_by_computation)); return result.heap_size; } @@ -82,16 +85,17 @@ StatusOr HeapSimulator::MinimumMemoryForComputation( const LogicalBuffer::SizeFunction& size_function, const HloSchedule* schedule) { TF_ASSIGN_OR_RETURN( - HeapSimulator::Result result, - HeapSimulator::Run(absl::make_unique(), - computation, sequence, alias_analysis, size_function, - schedule, HeapSimulator::Options())); + HeapSimulator::Result result, + HeapSimulator::Run( + absl::make_unique>(), computation, + sequence, alias_analysis, size_function, schedule, + HeapSimulator::Options())); return result.heap_size; } /*static*/ -StatusOr HeapSimulator::Run( - std::unique_ptr algorithm, const HloModule& module, +StatusOr> HeapSimulator::Run( + std::unique_ptr> algorithm, const HloModule& module, const HloSchedule& schedule, const HloAliasAnalysis& alias_analysis, const BufferValue::SizeFunction& size_fn, const Options& options) { HeapSimulator heap(std::move(algorithm), size_fn, options, &schedule); @@ -108,8 +112,9 @@ StatusOr HeapSimulator::Run( } /*static*/ -StatusOr HeapSimulator::Run( - std::unique_ptr algorithm, const HloComputation& computation, +StatusOr> HeapSimulator::Run( + std::unique_ptr> algorithm, + const HloComputation& computation, const HloInstructionSequence& instruction_sequence, const HloAliasAnalysis& alias_analysis, const BufferValue::SizeFunction& size_fn, const Options& options, @@ -128,8 +133,9 @@ StatusOr HeapSimulator::Run( } /*static*/ -StatusOr HeapSimulator::Run( - std::unique_ptr algorithm, const HloComputation& computation, +StatusOr> HeapSimulator::Run( + std::unique_ptr> algorithm, + const HloComputation& computation, const HloInstructionSequence& instruction_sequence, const HloAliasAnalysis& alias_analysis, const BufferValue::SizeFunction& size_fn, const HloSchedule* schedule, @@ -326,12 +332,13 @@ Status HeapSimulator::RunComputation( } HeapSimulator::HeapSimulator( - std::unique_ptr algorithm, + std::unique_ptr> algorithm, const BufferValue::SizeFunction& size_fn, const Options& options, const HloSchedule* schedule, const absl::flat_hash_map* memory_by_computation) - : no_fragmentation_stats_(absl::make_unique()), + : no_fragmentation_stats_( + absl::make_unique>()), algorithm_(std::move(algorithm)), size_fn_(size_fn), options_(options), @@ -396,8 +403,8 @@ void HeapSimulator::ShareBuffer(const HloValue* buffer, const HloValue* shared, shared); } -HeapSimulator::Result HeapSimulator::Finish() { - Result result = algorithm_->Finish(); +HeapSimulator::Result HeapSimulator::Finish() { + Result result = algorithm_->Finish(); // Post-process the result to add chunks for shared buffers. 
An empty chunk // map means that either no buffers were allocated, or the heap was only @@ -411,7 +418,7 @@ HeapSimulator::Result HeapSimulator::Finish() { } // Fragmentation is the difference between the actual and ideal sizes. - const Result no_frag_result = no_fragmentation_stats_->Finish(); + const Result no_frag_result = no_fragmentation_stats_->Finish(); result.fragmentation_size = result.heap_size - no_frag_result.heap_size; // Copy the debug trace we collected to the final result. @@ -437,14 +444,17 @@ void HeapSimulator::FillDebugTrace(HeapSimulatorTrace::Event::Kind kind, } } -void NoFragmentationStatsHeap::Alloc(const HloValue* buffer, int64 size) { +template +void NoFragmentationStatsHeap::Alloc(const BufferType* buffer, + int64 size) { current_heap_size_ += size; if (current_heap_size_ > max_heap_size_) { max_heap_size_ = current_heap_size_; } } -void NoFragmentationStatsHeap::AccountForSubcomputationMemory( +template +void NoFragmentationStatsHeap::AccountForSubcomputationMemory( const HloInstruction* instruction, int64 alloc_size_by_instruction, const absl::flat_hash_map& memory_by_computation) { @@ -472,11 +482,15 @@ void NoFragmentationStatsHeap::AccountForSubcomputationMemory( std::max(max_heap_size_, current_heap_size_ + max_subcomputation_bytes); } -void NoFragmentationStatsHeap::Free(const HloValue* buffer, int64 size) { +template +void NoFragmentationStatsHeap::Free(const BufferType* buffer, + int64 size) { current_heap_size_ -= size; } -HeapSimulator::Result NoFragmentationStatsHeap::Finish() { +template +HeapSimulator::Result +NoFragmentationStatsHeap::Finish() { // The result.chunk_map is empty, since we only collect stats, and don't // actually compute chunk assignments. Result result; @@ -484,7 +498,8 @@ HeapSimulator::Result NoFragmentationStatsHeap::Finish() { return result; } -GlobalDecreasingSizeBestFitHeap::GlobalDecreasingSizeBestFitHeap( +template +GlobalDecreasingSizeBestFitHeap::GlobalDecreasingSizeBestFitHeap( int64 alignment, Type type) : alignment_(alignment) { if (type == kTemporal) { @@ -495,8 +510,10 @@ GlobalDecreasingSizeBestFitHeap::GlobalDecreasingSizeBestFitHeap( } } -GlobalDecreasingSizeBestFitHeap::BufferIntervalCompare -GlobalDecreasingSizeBestFitHeap::GetTemporalBufferIntervalCompare() const { +template +typename GlobalDecreasingSizeBestFitHeap::BufferIntervalCompare +GlobalDecreasingSizeBestFitHeap::GetTemporalBufferIntervalCompare() + const { return [&](const BufferInterval& x, const BufferInterval& y) { int64 x_end = x.end; for (auto colocation : GetTransitiveColocations(x)) { @@ -515,12 +532,14 @@ GlobalDecreasingSizeBestFitHeap::GetTemporalBufferIntervalCompare() const { if (x.size != y.size) { return x.size > y.size; } - return x.buffer->id() < y.buffer->id(); + return *x.buffer < *y.buffer; }; } -/*static*/ GlobalDecreasingSizeBestFitHeap::BufferIntervalCompare -GlobalDecreasingSizeBestFitHeap::GetSpatialBufferIntervalCompare() { +template +/*static*/ typename GlobalDecreasingSizeBestFitHeap< + BufferType>::BufferIntervalCompare +GlobalDecreasingSizeBestFitHeap::GetSpatialBufferIntervalCompare() { return [&](const BufferInterval& x, const BufferInterval& y) { if (x.size != y.size) { return x.size > y.size; @@ -528,12 +547,13 @@ GlobalDecreasingSizeBestFitHeap::GetSpatialBufferIntervalCompare() { if (x.end - x.start != y.end - y.start) { return x.end - x.start > y.end - y.start; } - return x.buffer->id() < y.buffer->id(); + return *x.buffer < *y.buffer; }; } -void GlobalDecreasingSizeBestFitHeap::Alloc(const HloValue* buffer, - int64 
size) { +template +void GlobalDecreasingSizeBestFitHeap::Alloc( + const BufferType* buffer, int64 size) { // Degenerate case: 0-sized buffers are always allocated at offset 0. if (size == 0) { result_.chunk_map.emplace(buffer, Chunk{0, 0}); @@ -546,9 +566,9 @@ void GlobalDecreasingSizeBestFitHeap::Alloc(const HloValue* buffer, ++current_time_; } -void GlobalDecreasingSizeBestFitHeap::ShareWith(const HloValue* buffer, - const HloValue* share_with, - int64 size) { +template +void GlobalDecreasingSizeBestFitHeap::ShareWith( + const BufferType* buffer, const BufferType* share_with, int64 size) { // Degenerate case: 0-sized buffers are always allocated at offset 0. if (size == 0) { result_.chunk_map.emplace(buffer, Chunk{0, 0}); @@ -562,15 +582,16 @@ void GlobalDecreasingSizeBestFitHeap::ShareWith(const HloValue* buffer, ++current_time_; } -absl::flat_hash_set -GlobalDecreasingSizeBestFitHeap::GetTransitiveColocations( +template +absl::flat_hash_set +GlobalDecreasingSizeBestFitHeap::GetTransitiveColocations( const BufferInterval& interval) const { - absl::flat_hash_set result; + absl::flat_hash_set result; std::vector worklist = {&interval}; while (!worklist.empty()) { const BufferInterval* item = worklist.back(); worklist.pop_back(); - for (const HloValue* buffer_colocated : item->colocations) { + for (const BufferType* buffer_colocated : item->colocations) { result.insert(buffer_colocated); worklist.push_back(&buffer_intervals_.at(buffer_colocated)); } @@ -579,7 +600,9 @@ GlobalDecreasingSizeBestFitHeap::GetTransitiveColocations( return result; } -void GlobalDecreasingSizeBestFitHeap::Free(const HloValue* buffer, int64 size) { +template +void GlobalDecreasingSizeBestFitHeap::Free(const BufferType* buffer, + int64 size) { // Degenerate case: 0-sized buffers are always allocated at offset 0. 
if (size == 0) { return; @@ -785,7 +808,9 @@ std::vector BufferIntervalTree::ChunksOverlappingInTime( return result; } -HeapSimulator::Result GlobalDecreasingSizeBestFitHeap::Finish() { +template +HeapSimulator::Result +GlobalDecreasingSizeBestFitHeap::Finish() { std::vector sorted_buffer_intervals = GetSortedBufferIntervals(); @@ -803,8 +828,10 @@ HeapSimulator::Result GlobalDecreasingSizeBestFitHeap::Finish() { return result_; } -std::vector -GlobalDecreasingSizeBestFitHeap::GetSortedBufferIntervals() const { +template +std::vector< + typename GlobalDecreasingSizeBestFitHeap::BufferInterval> +GlobalDecreasingSizeBestFitHeap::GetSortedBufferIntervals() const { std::vector sorted_buffer_intervals; for (auto& entry : buffer_intervals_) { sorted_buffer_intervals.push_back(entry.second); @@ -814,8 +841,9 @@ GlobalDecreasingSizeBestFitHeap::GetSortedBufferIntervals() const { return sorted_buffer_intervals; } -GlobalDecreasingSizeBestFitHeap::ChunkCandidate -GlobalDecreasingSizeBestFitHeap::FindChunkCandidate( +template +typename GlobalDecreasingSizeBestFitHeap::ChunkCandidate +GlobalDecreasingSizeBestFitHeap::FindChunkCandidate( const GlobalDecreasingSizeBestFitHeap::BufferInterval& buffer_interval, int64 preferred_offset) const { VLOG(1) << "Finding chunks for buffer: " @@ -912,9 +940,12 @@ GlobalDecreasingSizeBestFitHeap::FindChunkCandidate( return chunk_candidate; } -void GlobalDecreasingSizeBestFitHeap::CommitChunk( - const GlobalDecreasingSizeBestFitHeap::BufferInterval& buffer_interval, - GlobalDecreasingSizeBestFitHeap::ChunkCandidate chunk_candidate) { +template +void GlobalDecreasingSizeBestFitHeap::CommitChunk( + const GlobalDecreasingSizeBestFitHeap::BufferInterval& + buffer_interval, + GlobalDecreasingSizeBestFitHeap::ChunkCandidate + chunk_candidate) { // Update the maximum heap size according to the one determined by the chunk // candidate. result_.heap_size = chunk_candidate.heap_size; @@ -930,13 +961,16 @@ void GlobalDecreasingSizeBestFitHeap::CommitChunk( AddToChunkMap(buffer_interval.buffer, chunk_candidate.chunk); } -void GlobalDecreasingSizeBestFitHeap::AddToChunkMap(const HloValue* buffer, - Chunk chunk) { +template +void GlobalDecreasingSizeBestFitHeap::AddToChunkMap( + const BufferType* buffer, Chunk chunk) { const auto emplace_result = result_.chunk_map.emplace(buffer, chunk); DCHECK(emplace_result.second); } -HeapSimulator::Result ChooseBestHeapAlgorithm::Finish() { +template +HeapSimulator::Result +ChooseBestHeapAlgorithm::Finish() { DCHECK(!algorithms_.empty()); std::vector results(algorithms_.size()); int64 min_size = INT64_MAX; @@ -953,4 +987,9 @@ HeapSimulator::Result ChooseBestHeapAlgorithm::Finish() { return results[min_size_index]; } +template class GlobalDecreasingSizeBestFitHeap; +template class GlobalDecreasingSizeBestFitHeap< + MemorySpaceAssignmentRepacker::AllocationBlock>; +template class ChooseBestHeapAlgorithm; + } // namespace xla diff --git a/tensorflow/compiler/xla/service/heap_simulator.h b/tensorflow/compiler/xla/service/heap_simulator.h index d3b781ded0c..b47ff685139 100644 --- a/tensorflow/compiler/xla/service/heap_simulator.h +++ b/tensorflow/compiler/xla/service/heap_simulator.h @@ -40,7 +40,9 @@ limitations under the License. namespace xla { // Forward declare classes defined below. +template class HeapAlgorithm; +template class NoFragmentationStatsHeap; // HeapSimulator assigns buffer offsets by running a simulation of a regular @@ -66,9 +68,10 @@ class HeapSimulator { }; // Result represents the result of the heap simulation. 
+ template struct Result { // The assignment of buffers to chunks. - absl::flat_hash_map chunk_map; + absl::flat_hash_map chunk_map; // The total size in bytes of the heap, containing all assigned chunks. int64 heap_size = 0; @@ -128,19 +131,19 @@ class HeapSimulator { // to running on a per-computation basis, since we can re-use buffer space for // called sub-computations. // - static StatusOr Run(std::unique_ptr algorithm, - const HloModule& module, - const HloSchedule& schedule, - const HloAliasAnalysis& alias_analysis, - const BufferValue::SizeFunction& size_fn, - const Options& options = Options()); + static StatusOr> Run( + std::unique_ptr> algorithm, + const HloModule& module, const HloSchedule& schedule, + const HloAliasAnalysis& alias_analysis, + const BufferValue::SizeFunction& size_fn, + const Options& options = Options()); // Same as above, but runs on a single computation. The 'instruction_sequence' // must contain a topologically-consistent total ordering of all instructions // in the computation. The result is invalid if instructions are not run in // exactly this sequence. - static StatusOr Run( - std::unique_ptr algorithm, + static StatusOr> Run( + std::unique_ptr> algorithm, const HloComputation& computation, const HloInstructionSequence& instruction_sequence, const HloAliasAnalysis& alias_analysis, @@ -151,8 +154,8 @@ class HeapSimulator { // Same as above, but runs on with a schedule that covers all nested // computations. - static StatusOr Run( - std::unique_ptr algorithm, + static StatusOr> Run( + std::unique_ptr> algorithm, const HloComputation& computation, const HloInstructionSequence& instruction_sequence, const HloAliasAnalysis& alias_analysis, @@ -163,7 +166,7 @@ class HeapSimulator { // If 'schedule' is non-null, it is used to find kCall and kWhile // sub-computations, and the heap simulation for those sub-computations will // be run recursively. I.e. the simulation is run over the whole module. - HeapSimulator(std::unique_ptr algorithm, + HeapSimulator(std::unique_ptr> algorithm, const BufferValue::SizeFunction& size_fn, const Options& options, const HloSchedule* schedule = nullptr, const absl::flat_hash_map* @@ -187,7 +190,7 @@ class HeapSimulator { // Two buffers belong to the same shared group. // Eight of the buffer has no shared group assigned. bool InSameSharedGroup(const HloValue* left, const HloValue* right); - Result Finish(); + Result Finish(); void FillDebugTrace(HeapSimulatorTrace::Event::Kind kind, const HloValue* buffer, const HloInstruction* instruction, @@ -196,8 +199,9 @@ class HeapSimulator { // Counterintuitive: the algorithm_ itself can be a NoFragmentationStatsHeap, // in which case we are calculating the same allocs/frees twice in the // simulation. - const std::unique_ptr no_fragmentation_stats_; - const std::unique_ptr algorithm_; + const std::unique_ptr> + no_fragmentation_stats_; + const std::unique_ptr> algorithm_; const BufferValue::SizeFunction size_fn_; const Options options_; // schedule_ is set by buffer assignment, and memory_by_computation_ is @@ -220,15 +224,16 @@ class HeapSimulator { // offsets to buffers. A sequence of Alloc / Free calls will be made, with the // same semantics as a regular memory heap. Finish will be called at the end to // collect the simulation results. +template class HeapAlgorithm { public: using Chunk = HeapSimulator::Chunk; - using Result = HeapSimulator::Result; + using Result = HeapSimulator::Result; virtual ~HeapAlgorithm() = default; // Alloc allocates a buffer of 'size' bytes. 
- virtual void Alloc(const HloValue* buffer, int64 size) = 0; + virtual void Alloc(const BufferType* buffer, int64 size) = 0; // Takes memory usage of subcomputations into account when calculating the // memory usage of a computation. Currently, we don't handle buffer aliasing @@ -247,7 +252,7 @@ class HeapAlgorithm { memory_by_computation) {} // Free de-allocates a previously allocated buffer. - virtual void Free(const HloValue* buffer, int64 size) = 0; + virtual void Free(const BufferType* buffer, int64 size) = 0; // Indicates that a buffer has to be collocated with another buffer. In // addition to Alloc and Free, the heap simulator exposes a concept of buffer @@ -255,7 +260,7 @@ class HeapAlgorithm { // the buffer, it associates the buffer with a previously allocated (or // shared) buffer. Each group of mutually-shared buffers points to a single // SharedGroup instance, which is a shared control block. - virtual void ShareWith(const HloValue* buffer, const HloValue* share_with, + virtual void ShareWith(const BufferType* buffer, const BufferType* share_with, int64 size) { Alloc(buffer, size); } @@ -269,19 +274,22 @@ class HeapAlgorithm { // this is the absolute minimum size for a given instruction sequence. The // result.chunk_map returned in Finish is always empty, since we only collect // stats, and don't actually compute chunk assignments. -class NoFragmentationStatsHeap : public HeapAlgorithm { +template +class NoFragmentationStatsHeap : public HeapAlgorithm { public: + using Result = HeapSimulator::Result; + NoFragmentationStatsHeap() = default; ~NoFragmentationStatsHeap() override = default; - void Alloc(const HloValue* buffer, int64 size) override; + void Alloc(const BufferType* buffer, int64 size) override; void AccountForSubcomputationMemory( const HloInstruction* instruction, int64 alloc_size_by_instruction, const absl::flat_hash_map& memory_by_computation) override; - void Free(const HloValue* buffer, int64 size) override; + void Free(const BufferType* buffer, int64 size) override; Result Finish() override; @@ -336,8 +344,12 @@ class BufferIntervalTree { // alloc/free time. It internally tracks the allocated buffers and their live // intervals; when allocating a buffer, it finds the best-fit free chunk during // its live interval. -class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm { +template +class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm { public: + using Result = HeapSimulator::Result; + using Chunk = HeapSimulator::Chunk; + enum Type { kSpatial = 0, kTemporal, @@ -345,7 +357,7 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm { // BufferInterval stores a buffer's size and time interval. struct BufferInterval { - const HloValue* buffer; + const BufferType* buffer; int64 size; // Alloc time of the buffer. int64 start; @@ -353,7 +365,7 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm { int64 end; // Colocation buffers that need to be collocated with this one. - std::vector colocations; + std::vector colocations; // True if this buffer needs an allocation. False if it is collocated with // other buffer. 
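A minimal usage sketch of the templated algorithm interface above, mirroring the GlobalDecreasingSizeBestFitHeap unit tests updated later in this patch (it assumes the buffer type is HloValue and that buffer_a_ and buffer_b_ are HloValue* provided by the caller):

    GlobalDecreasingSizeBestFitHeap<HloValue> heap(/*alignment=*/1);
    heap.Alloc(buffer_a_, 10);  // live range of a begins
    heap.Alloc(buffer_b_, 20);  // b overlaps a, so it gets a disjoint chunk
    heap.Free(buffer_a_, 10);   // live range of a ends
    heap.Free(buffer_b_, 20);
    const HeapSimulator::Result<HloValue> result = heap.Finish();
    // result.heap_size is the peak heap size; result.chunk_map maps each
    // HloValue* to the offset/size chunk assigned to it.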
@@ -368,10 +380,10 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm { Type type = kSpatial); ~GlobalDecreasingSizeBestFitHeap() override {} - void Alloc(const HloValue* buffer, int64 size) override; - void Free(const HloValue* buffer, int64 size) override; + void Alloc(const BufferType* buffer, int64 size) override; + void Free(const BufferType* buffer, int64 size) override; - void ShareWith(const HloValue* buffer, const HloValue* share_with, + void ShareWith(const BufferType* buffer, const BufferType* share_with, int64 size) override; Result Finish() override; @@ -404,7 +416,7 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm { void CommitChunk(const BufferInterval& buffer_interval, ChunkCandidate chunk_candidate); // Adds the buffer and the chunk to the result chunk map. - virtual void AddToChunkMap(const HloValue* buffer, Chunk chunk); + virtual void AddToChunkMap(const BufferType* buffer, Chunk chunk); // Return a BufferIntervalCompare function that sorts by live ranges. A live // range is defined by the range between the start of the first buffer and the @@ -413,7 +425,7 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm { // contiguous. BufferIntervalCompare GetTemporalBufferIntervalCompare() const; - absl::flat_hash_map buffer_intervals_; + absl::flat_hash_map buffer_intervals_; Result result_; BufferIntervalCompare buffer_interval_compare_; BufferIntervalTree interval_tree_; @@ -428,33 +440,37 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm { // Returns all transitive colocated buffers of this buffer interval. I.e., If // a buffer A is colocated with B and B is colocated with C, this function // returns all three of them. - absl::flat_hash_set GetTransitiveColocations( + absl::flat_hash_set GetTransitiveColocations( const BufferInterval& interval) const; }; // A heap algorithm that chooses the best results from other algorithms added to // it. 
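ChooseBestHeapAlgorithm, templatized just below, forwards every Alloc/Free/ShareWith to all registered algorithms and keeps the Finish() result with the smallest heap size. A sketch of how it might be driven after this change (the alignment value and the temporal-plus-spatial combination are illustrative, not part of this patch):

    using BestFit = GlobalDecreasingSizeBestFitHeap<HloValue>;
    auto algorithms = absl::make_unique<
        std::vector<std::unique_ptr<HeapAlgorithm<HloValue>>>>();
    algorithms->push_back(
        absl::make_unique<BestFit>(/*alignment=*/64, BestFit::kTemporal));
    algorithms->push_back(
        absl::make_unique<BestFit>(/*alignment=*/64, BestFit::kSpatial));
    ChooseBestHeapAlgorithm<HloValue> chooser(std::move(algorithms));
    // chooser.Finish() returns the Result<HloValue> whose heap_size is the
    // minimum across the registered strategies.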
-class ChooseBestHeapAlgorithm : public HeapAlgorithm { +template +class ChooseBestHeapAlgorithm : public HeapAlgorithm { public: + using Result = HeapSimulator::Result; + ChooseBestHeapAlgorithm( - std::unique_ptr>> algorithms) + std::unique_ptr>>> + algorithms) : algorithms_(std::move(*algorithms)) {} ~ChooseBestHeapAlgorithm() override {} - void Alloc(const HloValue* buffer, int64 size) override { + void Alloc(const BufferType* buffer, int64 size) override { for (auto& algorithm : algorithms_) { algorithm->Alloc(buffer, size); } } - void ShareWith(const HloValue* buffer, const HloValue* share_with, + void ShareWith(const BufferType* buffer, const BufferType* share_with, int64 size) override { for (auto& algorithm : algorithms_) { algorithm->ShareWith(buffer, share_with, size); } } - void Free(const HloValue* buffer, int64 size) override { + void Free(const BufferType* buffer, int64 size) override { for (auto& algorithm : algorithms_) { algorithm->Free(buffer, size); } @@ -463,7 +479,7 @@ class ChooseBestHeapAlgorithm : public HeapAlgorithm { Result Finish() override; private: - std::vector> algorithms_; + std::vector>> algorithms_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/heap_simulator_test.cc b/tensorflow/compiler/xla/service/heap_simulator_test.cc index b5b711cab4f..8f7668b4965 100644 --- a/tensorflow/compiler/xla/service/heap_simulator_test.cc +++ b/tensorflow/compiler/xla/service/heap_simulator_test.cc @@ -228,7 +228,7 @@ const char kFinish[] = "Finish"; using CallSequence = std::vector>; // HeapCallRecorder is a dummy heap algorithm that simply records its calls. -class HeapCallRecorder : public HeapAlgorithm { +class HeapCallRecorder : public HeapAlgorithm { public: explicit HeapCallRecorder(CallSequence* calls) : calls_(calls) {} ~HeapCallRecorder() override {} @@ -396,7 +396,7 @@ class HeapSimulatorTracker { std::unique_ptr module_; std::unique_ptr alias_analysis_; CallSequence actual_calls_; - HeapSimulator::Result result_; + HeapSimulator::Result result_; }; class HeapSimulatorTest : public HloTestBase { @@ -976,12 +976,12 @@ class HeapAlgorithmTestBase : public ::testing::Test { class NoFragmentationStatsHeapTest : public HeapAlgorithmTestBase {}; TEST_F(NoFragmentationStatsHeapTest, Empty) { - NoFragmentationStatsHeap heap; + NoFragmentationStatsHeap heap; EXPECT_EQ(0, heap.Finish().heap_size); } TEST_F(NoFragmentationStatsHeapTest, Simple) { - NoFragmentationStatsHeap heap; + NoFragmentationStatsHeap heap; heap.Alloc(buffer_a_, 10); heap.Alloc(buffer_b_, 20); heap.Alloc(buffer_c_, 30); @@ -994,7 +994,7 @@ TEST_F(NoFragmentationStatsHeapTest, Simple) { } TEST_F(NoFragmentationStatsHeapTest, Mixed) { - NoFragmentationStatsHeap heap; + NoFragmentationStatsHeap heap; heap.Alloc(buffer_a_, 10); // max: A heap.Alloc(buffer_b_, 20); // max: A+B @@ -1013,7 +1013,7 @@ TEST_F(NoFragmentationStatsHeapTest, Mixed) { class GlobalDecreasingSizeBestFitHeapTest : public HeapAlgorithmTestBase { protected: class InheritedGlobalDecreasingSizeBestFitHeap - : public GlobalDecreasingSizeBestFitHeap { + : public GlobalDecreasingSizeBestFitHeap { public: InheritedGlobalDecreasingSizeBestFitHeap() : GlobalDecreasingSizeBestFitHeap(/*alignment=*/1) {} @@ -1048,8 +1048,8 @@ class GlobalDecreasingSizeBestFitHeapTest : public HeapAlgorithmTestBase { }; TEST_F(GlobalDecreasingSizeBestFitHeapTest, Empty) { - GlobalDecreasingSizeBestFitHeap heap(/*alignment=*/1); - const HeapSimulator::Result result = heap.Finish(); + GlobalDecreasingSizeBestFitHeap heap(/*alignment=*/1); + 
const HeapSimulator::Result result = heap.Finish(); EXPECT_EQ(0, result.heap_size); EXPECT_EQ(0, result.chunk_map.size()); } @@ -1068,7 +1068,7 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, DecreasingSize) { // | | d | // | +-------+ // -----------------> time - GlobalDecreasingSizeBestFitHeap heap(/*alignment=*/1); + GlobalDecreasingSizeBestFitHeap heap(/*alignment=*/1); heap.Alloc(buffer_a_, 10); heap.Alloc(buffer_b_, 30); heap.Alloc(buffer_c_, 20); @@ -1078,7 +1078,7 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, DecreasingSize) { heap.Free(buffer_c_, 20); heap.Free(buffer_d_, 40); - const HeapSimulator::Result result = heap.Finish(); + const HeapSimulator::Result result = heap.Finish(); EXPECT_EQ(100, result.heap_size); EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size); EXPECT_EQ(30, result.chunk_map.at(buffer_b_).size); @@ -1107,7 +1107,7 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, DecreasingSizeWithAlignment) { // | | | // | +-------+ // ---------------------> time - GlobalDecreasingSizeBestFitHeap heap(/*alignment=*/20); + GlobalDecreasingSizeBestFitHeap heap(/*alignment=*/20); heap.Alloc(buffer_a_, 10); heap.Alloc(buffer_b_, 20); heap.Alloc(buffer_c_, 50); @@ -1117,7 +1117,7 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, DecreasingSizeWithAlignment) { heap.Free(buffer_c_, 50); heap.Free(buffer_d_, 40); - const HeapSimulator::Result result = heap.Finish(); + const HeapSimulator::Result result = heap.Finish(); EXPECT_EQ(120, result.heap_size); EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size); EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size); @@ -1148,7 +1148,7 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, BestFit) { // | | | // | +-------+ // ---------------------> time - GlobalDecreasingSizeBestFitHeap heap(/*alignment=*/1); + GlobalDecreasingSizeBestFitHeap heap(/*alignment=*/1); heap.Alloc(buffer_a_, 10); heap.Alloc(buffer_b_, 20); heap.Alloc(buffer_c_, 40); @@ -1160,7 +1160,7 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, BestFit) { heap.Free(buffer_d_, 30); heap.Free(buffer_e_, 50); - const HeapSimulator::Result result = heap.Finish(); + const HeapSimulator::Result result = heap.Finish(); EXPECT_EQ(140, result.heap_size); EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size); EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size); @@ -1184,7 +1184,7 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, Colocated) { // || |+----+| | // |+--a---++-b--++---c---+ // ---------------------> time - GlobalDecreasingSizeBestFitHeap heap(/*alignment=*/1); + GlobalDecreasingSizeBestFitHeap heap(/*alignment=*/1); heap.Alloc(buffer_a_, 40); heap.Free(buffer_a_, 40); heap.Alloc(buffer_b_, 20); @@ -1192,7 +1192,7 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, Colocated) { heap.ShareWith(buffer_c_, buffer_a_, 40); heap.Free(buffer_c_, 40); - const HeapSimulator::Result result = heap.Finish(); + const HeapSimulator::Result result = heap.Finish(); EXPECT_EQ(40, result.heap_size); EXPECT_EQ(40, result.chunk_map.at(buffer_a_).size); EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size); @@ -1212,7 +1212,7 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, ColocatedII) { // || | | | <--- colocate with a // |+--a---+ +---c---+ // ---------------------> time - GlobalDecreasingSizeBestFitHeap heap(/*alignment=*/1); + GlobalDecreasingSizeBestFitHeap heap(/*alignment=*/1); heap.Alloc(buffer_a_, 40); heap.Free(buffer_a_, 40); heap.Alloc(buffer_b_, 20); @@ -1221,7 +1221,7 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, ColocatedII) { heap.Free(buffer_c_, 40); heap.Free(buffer_b_, 20); - const HeapSimulator::Result 
result = heap.Finish(); + const HeapSimulator::Result result = heap.Finish(); EXPECT_EQ(60, result.heap_size); EXPECT_EQ(40, result.chunk_map.at(buffer_a_).size); EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size); @@ -1242,7 +1242,7 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, ColocatedIII) { // | | | // | +-------b-------+ // ---------------------> time - GlobalDecreasingSizeBestFitHeap heap(/*alignment=*/1); + GlobalDecreasingSizeBestFitHeap heap(/*alignment=*/1); heap.Alloc(buffer_a_, 10); heap.Free(buffer_a_, 10); heap.Alloc(buffer_b_, 30); @@ -1251,7 +1251,7 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, ColocatedIII) { heap.Free(buffer_c_, 10); heap.Free(buffer_b_, 30); - const HeapSimulator::Result result = heap.Finish(); + const HeapSimulator::Result result = heap.Finish(); EXPECT_EQ(40, result.heap_size); EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size); EXPECT_EQ(30, result.chunk_map.at(buffer_b_).size); diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto index 17a7b18c84b..c3a7b3a5c14 100644 --- a/tensorflow/compiler/xla/service/hlo.proto +++ b/tensorflow/compiler/xla/service/hlo.proto @@ -35,7 +35,7 @@ import "tensorflow/compiler/xla/xla_data.proto"; option cc_enable_arenas = true; // Serialization of HloInstruction. -// Next ID: 73 +// Next ID: 74 message HloInstructionProto { reserved 10; reserved "parameter_name"; @@ -251,6 +251,9 @@ message HloInstructionProto { // The comparison type used for kCompare. string comparison_type = 72; + + // Specifies if this is a cross-program-prefetch, used by kCopyStart. + bool is_cross_program_prefetch = 73; } // Serialization of HloComputation. diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc index 438aa6ff05f..14daf680ac9 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.cc +++ b/tensorflow/compiler/xla/service/hlo_computation.cc @@ -545,7 +545,7 @@ string HloComputation::ToString( if (options.print_percent()) { s << "%"; } - if (options.print_ids() || !IsEntryComputation()) { + if (options.print_ids()) { // Exclude entry computation's name because it includes and leads to // non-deterministic fingerprint. s << PrintName(name(), options.print_ids()) << " "; diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc index 72b15db0dcd..939c713fc18 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc @@ -486,6 +486,10 @@ Status HloCostAnalysis::HandleReshape(const HloInstruction*) { return Status::OK(); } +Status HloCostAnalysis::HandleDynamicReshape(const HloInstruction*) { + return Status::OK(); +} + Status HloCostAnalysis::HandleBatchNormTraining(const HloInstruction*) { // TODO(b/62294698): Implement cost analysis for batch-norm-training. 
return Status::OK(); diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h index d9085dd7785..f101e3819c9 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h @@ -113,6 +113,7 @@ class HloCostAnalysis : public ConstDfsHloVisitor { Status HandleBroadcast(const HloInstruction* broadcast) override; Status HandlePad(const HloInstruction* pad) override; Status HandleReshape(const HloInstruction* reshape) override; + Status HandleDynamicReshape(const HloInstruction* reshape) override; Status HandleAddDependency(const HloInstruction* add_dependency) override; Status HandleAfterAll(const HloInstruction* token) override; Status HandleTranspose(const HloInstruction* transpose) override; diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc index 1bbbb248bbc..551ffb52031 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc @@ -1229,10 +1229,10 @@ TEST_P(HloDataflowAnalysisTest, CopyStartAndCopyDone) { auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); - auto copy_start = builder.AddInstruction(HloInstruction::CreateUnary( + auto copy_start = builder.AddInstruction(HloInstruction::CreateCopyStart( ShapeUtil::MakeTupleShape({constant->shape(), constant->shape(), ShapeUtil::MakeShape(U32, {})}), - HloOpcode::kCopyStart, constant)); + constant)); auto copy_done = builder.AddInstruction(HloInstruction::CreateUnary( constant->shape(), HloOpcode::kCopyDone, copy_start)); module_->AddEntryComputation(builder.Build()); diff --git a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc deleted file mode 100644 index 9415e20af7b..00000000000 --- a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h" - -#include "absl/algorithm/container.h" -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/service/dynamic_dimension_inference.h" -#include "tensorflow/compiler/xla/service/hlo_instruction.h" -#include "tensorflow/compiler/xla/service/hlo_opcode.h" -#include "tensorflow/compiler/xla/service/shape_inference.h" - -namespace xla { - -namespace { - -StatusOr ReplaceGetSize( - HloInstruction* instr, - DynamicDimensionInference* dynamic_dimension_inference) { - if (instr->opcode() != HloOpcode::kGetDimensionSize) { - return false; - } - HloComputation* computation = instr->parent(); - - TF_ASSIGN_OR_RETURN(auto legal_shape, - ShapeInference::InferGetDimensionSizeShape( - instr->operand(0)->shape(), instr->dimension())); - TF_RET_CHECK(ShapeUtil::Equal(instr->shape(), legal_shape)) - << "instr->shape() " << instr->shape().ToString() << " , " - << "legal_shape " << legal_shape.ToString(); - TF_RET_CHECK(ShapeUtil::HasPrimitiveType(instr->shape(), S32)); - HloInstruction* operand = instr->mutable_operand(0); - int64 dim = instr->dimension(); - HloInstruction* dynamic_size = - dynamic_dimension_inference->GetDynamicSize(operand, {}, dim); - if (dynamic_size != nullptr) { - TF_RETURN_IF_ERROR(instr->ReplaceAllUsesWith(dynamic_size)); - // The dependency between a instruction and its dynamic dimensions is not - // modeled in the IR. As instr is being replaced by dynamic_size, also tell - // dynamic dimension inference that the instruction is being replaced. - dynamic_dimension_inference->ReplaceAllDynamicDimensionUsesWith( - instr, dynamic_size); - } else { - int32 size = instr->operand(0)->shape().dimensions(dim); - HloInstruction* new_instr = computation->AddInstruction( - HloInstruction::CreateConstant(LiteralUtil::CreateR0(size))); - TF_RETURN_IF_ERROR(instr->ReplaceAllUsesWith(new_instr)); - dynamic_dimension_inference->ReplaceAllDynamicDimensionUsesWith(instr, - new_instr); - } - return true; -} - -StatusOr ReplaceSetSize(HloInstruction* instr) { - if (instr->opcode() != HloOpcode::kSetDimensionSize) { - return false; - } - - TF_RET_CHECK(Shape::Equal().IgnoreDynamicDimension()( - instr->shape(), instr->operand(0)->shape())) - << "instr->shape() " << instr->shape().ToString() << " , " - << "instruction operand shape " << instr->operand(0)->shape(); - HloInstruction* operand = instr->mutable_operand(0); - - TF_RETURN_IF_ERROR(instr->ReplaceAllUsesWith(operand)); - return true; -} - -} // namespace - -StatusOr HloGetDimensionSizeRewriter::Run(HloModule* module) { - bool changed = false; - HloProto proto; - TF_ASSIGN_OR_RETURN(DynamicDimensionInference inference, - DynamicDimensionInference::Run(module)); - *proto.mutable_hlo_module() = module->ToProto(); - // It's important to replace get-dimension-size first before - // set-dimension-size for the case below: - // static_op dynamic_size - // | | - // set-dimension-size // Marks the dimension as dynamic - // | - // get-dimension-size - // - // If we replace set dimension size first, we'd have - // - // static_op - // | - // get-dimension-size - // - // This will get static size of the op, which is incorrect. 
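[Editor's illustration, not part of the deleted pass or its tests.] To make the ordering argument in the comment above concrete, here is a hypothetical HLO snippet written in the same textual syntax the tests below use, wrapped in a C++ raw string as those tests do. If set-dimension-size were rewritten first, `gds` would fold to the static bound 3; rewriting get-dimension-size first resolves it to `dyn_size`, which is the intended behavior.

// Illustration only; module and value names are made up for this sketch.
const char* const kOrderingExample = R"(
HloModule _
ENTRY example {
  p = s32[3,4] parameter(0)
  dyn_size = s32[] parameter(1)
  p_dynamic = s32[<=3,4] set-dimension-size(p, dyn_size), dimensions={0}
  ROOT gds = s32[] get-dimension-size(p_dynamic), dimensions={0}
})";
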
- for (auto* computation : module->computations()) { - for (auto instruction : computation->MakeInstructionPostOrder()) { - TF_ASSIGN_OR_RETURN(bool replaced_get_size, - ReplaceGetSize(instruction, &inference)); - changed = changed || replaced_get_size; - } - } - for (auto* computation : module->computations()) { - for (auto instruction : computation->MakeInstructionPostOrder()) { - TF_ASSIGN_OR_RETURN(bool replaced_set_size, ReplaceSetSize(instruction)); - changed = changed || replaced_set_size; - } - } - return changed; -} - -} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter_test.cc b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter_test.cc deleted file mode 100644 index b1491e96095..00000000000 --- a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter_test.cc +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h" - -#include "tensorflow/compiler/xla/service/hlo_computation.h" -#include "tensorflow/compiler/xla/service/hlo_instruction.h" -#include "tensorflow/compiler/xla/service/hlo_matchers.h" -#include "tensorflow/compiler/xla/service/hlo_module.h" -#include "tensorflow/compiler/xla/service/hlo_opcode.h" -#include "tensorflow/compiler/xla/service/hlo_parser.h" -#include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/tests/hlo_test_base.h" -#include "tensorflow/compiler/xla/tests/literal_test_util.h" -#include "tensorflow/compiler/xla/tests/test_utils.h" -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/platform/types.h" - -namespace xla { -namespace { - -namespace op = xla::testing::opcode_matchers; - -class HloGetDimensionSizeRewriterTest : public HloTestBase { - protected: - HloGetDimensionSizeRewriterTest() {} -}; - -TEST_F(HloGetDimensionSizeRewriterTest, Ok) { - auto module = ParseAndReturnVerifiedModule(R"( -HloModule _ -ENTRY gds { - p = s32[3,4] parameter(0) - size0 = s32[] get-dimension-size(p), dimensions={0} - size1 = s32[] get-dimension-size(p), dimensions={1} - ROOT mul = s32[] multiply(size0, size1) -})") - .ValueOrDie(); - HloGetDimensionSizeRewriter pass; - EXPECT_TRUE(pass.Run(module.get()).ValueOrDie()); - EXPECT_THAT(module->entry_computation()->root_instruction(), - op::Multiply(op::Constant(), op::Constant())); -} - -TEST_F(HloGetDimensionSizeRewriterTest, GetSetSetDimensionSizeRewriter) { - auto module = ParseAndReturnVerifiedModule(R"( -HloModule _ -ENTRY gds { - p = s32[3,4] parameter(0) - size0 = s32[] get-dimension-size(p), dimensions={0} - p_copy = s32[3,4] copy(p) - p_copy_dynamic = s32[<=3, 4] set-dimension-size(p_copy, size0), dimensions={0} - size1 = s32[] get-dimension-size(p_copy_dynamic), dimensions={0} - ROOT 
mul = s32[] multiply(size0, size1) -})") - .ValueOrDie(); - HloGetDimensionSizeRewriter pass; - EXPECT_TRUE(pass.Run(module.get()).ValueOrDie()); - EXPECT_THAT(module->entry_computation()->root_instruction(), - op::Multiply(op::Constant(), op::Constant())); -} - -TEST_F(HloGetDimensionSizeRewriterTest, IllegalType) { - auto module = ParseAndReturnUnverifiedModule(R"( -HloModule _ -ENTRY gds { - p = s32[3]{0} parameter(0) - ROOT gds = s64[] get-dimension-size(p), dimensions={0} -})") - .ValueOrDie(); - HloGetDimensionSizeRewriter pass; - EXPECT_FALSE(pass.Run(module.get()).ok()); -} - -TEST_F(HloGetDimensionSizeRewriterTest, IllegalDimension) { - auto module = ParseAndReturnUnverifiedModule(R"( -HloModule _ -ENTRY gds { - p = f32[2,5] parameter(0) - ROOT gds = s32[] get-dimension-size(p), dimensions={2} -})") - .ValueOrDie(); - HloGetDimensionSizeRewriter pass; - EXPECT_FALSE(pass.Run(module.get()).ok()); -} - -} // namespace -} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index d7e8984dee8..164e92ae8e8 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -1012,6 +1012,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) { case HloOpcode::kGather: case HloOpcode::kPad: case HloOpcode::kReshape: + case HloOpcode::kDynamicReshape: case HloOpcode::kReverse: case HloOpcode::kTupleSelect: case HloOpcode::kTranspose: diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index 2ce3c12b4e9..bb01fdd0e15 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -167,6 +167,11 @@ StatusOr> HloInstruction::CreateFromProto( absl::Span(fft_length)); break; } + case HloOpcode::kCopyStart: { + instruction = CreateCopyStart(shape, operands(0), + proto.is_cross_program_prefetch()); + break; + } case HloOpcode::kCompare: { // Auto-upgraded from deprecated opcode skips the following. 
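[Editor's note.] The kCopyStart deserialization case above uses the new CreateCopyStart factory together with the is_cross_program_prefetch proto field. A minimal construction sketch, mirroring the updated dataflow and matcher tests in this patch, is shown below; the helper name is hypothetical and the tuple shape (destination, source, u32[] context) follows those tests.

#include <memory>

#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/shape_util.h"

namespace xla {

// Sketch: build a copy-start/copy-done pair around a parameter and mark the
// async copy as a cross-program prefetch via the new factory argument.
std::unique_ptr<HloComputation> BuildCrossProgramPrefetchExample() {
  HloComputation::Builder builder("copy_start_example");
  const Shape f32_shape = ShapeUtil::MakeShape(F32, {2, 3});
  HloInstruction* p0 = builder.AddInstruction(
      HloInstruction::CreateParameter(0, f32_shape, "p0"));
  // copy-start produces a (dest, source, context) tuple, as in the tests.
  HloInstruction* copy_start =
      builder.AddInstruction(HloInstruction::CreateCopyStart(
          ShapeUtil::MakeTupleShape(
              {f32_shape, f32_shape, ShapeUtil::MakeShape(U32, {})}),
          p0, /*is_cross_program_prefetch=*/true));
  // copy-done yields the destination value.
  builder.AddInstruction(
      HloInstruction::CreateUnary(f32_shape, HloOpcode::kCopyDone, copy_start));
  return builder.Build();
}

}  // namespace xla
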
if (!comparison_direction) { @@ -700,6 +705,17 @@ StatusOr> HloInstruction::CreateFromProto( instruction = CreateReshape(shape, operands(0), inferred_dimension); break; } + case HloOpcode::kDynamicReshape: { + TF_RET_CHECK(shape.IsArray() && operands(0)->shape().IsArray() && + ShapeUtil::ElementsIn(shape) == + ShapeUtil::ElementsIn(operands(0)->shape())) + << "shape: " << ShapeUtil::HumanString(shape) + << " operand: " << ShapeUtil::HumanString(operands(0)->shape()); + const auto& operand_vector = all_operands(); + instruction = CreateDynamicReshape( + shape, operands(0), absl::MakeSpan(operand_vector).subspan(1)); + break; + } default: { instruction = absl::WrapUnique(new HloInstruction(opcode, shape)); for (const int64 operand_id : proto.operand_ids()) { @@ -828,7 +844,6 @@ HloInstruction::CreateRngBitGenerator(const Shape& shape, HloInstruction* state, case HloOpcode::kCeil: case HloOpcode::kCollectivePermuteDone: case HloOpcode::kCopy: - case HloOpcode::kCopyStart: case HloOpcode::kCopyDone: case HloOpcode::kCos: case HloOpcode::kClz: @@ -935,6 +950,13 @@ HloInstruction::CreateRngBitGenerator(const Shape& shape, HloInstruction* state, fft_length); } +/* static */ std::unique_ptr HloInstruction::CreateCopyStart( + const Shape& shape, HloInstruction* operand, + bool is_cross_program_prefetch) { + return absl::make_unique(shape, operand, + is_cross_program_prefetch); +} + /* static */ std::unique_ptr HloInstruction::CreateCompare( const Shape& shape, HloInstruction* lhs, HloInstruction* rhs, ComparisonDirection direction, absl::optional type) { @@ -1373,6 +1395,19 @@ HloInstruction::CreateBroadcastSequence( inferred_dimension); } +/* static */ std::unique_ptr +HloInstruction::CreateDynamicReshape( + const Shape& shape, HloInstruction* data_operand, + absl::Span dim_sizes) { + CHECK_EQ(ShapeUtil::ElementsIn(shape), + ShapeUtil::ElementsIn(data_operand[0].shape())) + << "shape: " << ShapeUtil::HumanString(shape) + << " operand: " << ShapeUtil::HumanString(data_operand[0].shape()); + CHECK_EQ(shape.rank(), dim_sizes.size()); + return absl::make_unique(shape, data_operand, + dim_sizes); +} + /* static */ std::unique_ptr HloInstruction::CreateTranspose( const Shape& shape, HloInstruction* operand, absl::Span dimensions) { @@ -1569,6 +1604,7 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( case HloOpcode::kTranspose: case HloOpcode::kBroadcast: case HloOpcode::kReshape: + case HloOpcode::kDynamicReshape: case HloOpcode::kMap: case HloOpcode::kSlice: case HloOpcode::kConstant: @@ -2007,6 +2043,7 @@ bool HloInstruction::IdenticalSlowPath( case HloOpcode::kReal: case HloOpcode::kRemainder: case HloOpcode::kReshape: + case HloOpcode::kDynamicReshape: case HloOpcode::kReplicaId: case HloOpcode::kRoundNearestAfz: case HloOpcode::kRsqrt: @@ -2812,7 +2849,8 @@ HloInstructionProto HloInstruction::ToProto() const { string HloInstruction::ToCategory() const { if (opcode() == HloOpcode::kTranspose || opcode() == HloOpcode::kCopy || - opcode() == HloOpcode::kReshape) { + opcode() == HloOpcode::kReshape || + opcode() == HloOpcode::kDynamicReshape) { return "data formatting"; } @@ -3033,6 +3071,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase* visitor) { return visitor->HandlePad(this); case HloOpcode::kReshape: return visitor->HandleReshape(this); + case HloOpcode::kDynamicReshape: + return visitor->HandleDynamicReshape(this); case HloOpcode::kTranspose: return visitor->HandleTranspose(this); case HloOpcode::kReverse: @@ -4089,6 +4129,10 @@ const DomainMetadata& 
HloInstruction::user_side_metadata() const { return Cast(this)->user_side_metadata(); } +bool HloInstruction::is_cross_program_prefetch() const { + return Cast(this)->is_cross_program_prefetch(); +} + ComparisonDirection HloInstruction::comparison_direction() const { return Cast(this)->direction(); } diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index bdd64c908f0..7db128b4d34 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -592,6 +592,12 @@ class HloInstruction { const Shape& shape, HloInstruction* operand, FftType fft_type, absl::Span fft_length); + // Creates a copy-start op, indicating whether this is a cross-program + // prefetch or not. + static std::unique_ptr CreateCopyStart( + const Shape& shape, HloInstruction* operand, + bool is_cross_program_prefetch = false); + // Creates a compare op, performing the comparison specified in direction. static std::unique_ptr CreateCompare( const Shape& shape, HloInstruction* lhs, HloInstruction* rhs, @@ -879,6 +885,14 @@ class HloInstruction { const Shape& shape, HloInstruction* operand, int64 inferred_dimension = -1); + // Creates a dynamic reshape instruction. Similar to reshape but dynamic + // dimensions sizes are provided as additional variadic arguments. + // + // Precondition: dim_sizes.size() == shape.rank() + static std::unique_ptr CreateDynamicReshape( + const Shape& shape, HloInstruction* data_operand, + absl::Span dim_sizes); + // Creates a transpose instruction which permutes the operand dimensions. static std::unique_ptr CreateTranspose( const Shape& shape, HloInstruction* operand, @@ -1857,6 +1871,9 @@ class HloInstruction { // Delegates to HloDomainInstruction::user_side_metadata(). const DomainMetadata& user_side_metadata() const; + // Delegates to HloCopyStartInstruction::is_cross_program_prefetch(). + bool is_cross_program_prefetch() const; + // Delegates to HloCompareInstruction::direction(). 
ComparisonDirection comparison_direction() const; diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc index dbc1d85d1bb..df225e27aad 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.cc +++ b/tensorflow/compiler/xla/service/hlo_instructions.cc @@ -204,6 +204,47 @@ std::unique_ptr HloFftInstruction::CloneWithNewOperandsImpl( fft_length_); } +HloCopyStartInstruction::HloCopyStartInstruction(const Shape& shape, + HloInstruction* operand, + bool is_cross_program_prefetch) + : HloInstruction(HloOpcode::kCopyStart, shape), + is_cross_program_prefetch_(is_cross_program_prefetch) { + AppendOperand(operand); +} + +HloInstructionProto HloCopyStartInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + proto.set_is_cross_program_prefetch(is_cross_program_prefetch_); + return proto; +} + +std::vector HloCopyStartInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + std::vector result; + if (is_cross_program_prefetch()) { + result.push_back("is_cross_program_prefetch=true"); + } + return result; +} + +bool HloCopyStartInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + const auto& casted_other = static_cast(other); + return is_cross_program_prefetch() == + casted_other.is_cross_program_prefetch(); +} + +std::unique_ptr +HloCopyStartInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + CHECK_EQ(new_operands.size(), 1); + return absl::make_unique( + shape, new_operands[0], is_cross_program_prefetch()); +} + HloCompareInstruction::HloCompareInstruction( const Shape& shape, HloInstruction* lhs, HloInstruction* rhs, ComparisonDirection direction, absl::optional type) @@ -1027,6 +1068,25 @@ HloBroadcastInstruction::CloneWithNewOperandsImpl( dimensions()); } +HloDynamicReshapeInstruction::HloDynamicReshapeInstruction( + const Shape& shape, HloInstruction* data_operand, + absl::Span dim_sizes) + : HloInstruction(HloOpcode::kDynamicReshape, shape) { + AppendOperand(data_operand); + for (auto operand : dim_sizes) { + AppendOperand(operand); + } +} + +std::unique_ptr +HloDynamicReshapeInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const { + CHECK_GE(new_operands.size(), 1); + return absl::make_unique( + shape, new_operands[0], new_operands.subspan(1)); +} + HloReshapeInstruction::HloReshapeInstruction(const Shape& shape, HloInstruction* operand, int64 inferred_dimension) diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h index 3f92bb92f02..17368e8b714 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.h +++ b/tensorflow/compiler/xla/service/hlo_instructions.h @@ -132,6 +132,28 @@ class HloFftInstruction : public HloInstruction { std::vector fft_length_; }; +class HloCopyStartInstruction : public HloInstruction { + public: + explicit HloCopyStartInstruction(const Shape& shape, HloInstruction* operand, + bool is_cross_program_prefetch); + + bool is_cross_program_prefetch() const { return is_cross_program_prefetch_; } + HloInstructionProto ToProto() const override; + + private: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + std::unique_ptr 
CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + + bool is_cross_program_prefetch_; +}; + class HloCompareInstruction : public HloInstruction { public: explicit HloCompareInstruction(const Shape& shape, HloInstruction* lhs, @@ -679,6 +701,25 @@ class HloBroadcastInstruction : public HloInstruction { std::vector dimensions_; }; +class HloDynamicReshapeInstruction : public HloInstruction { + public: + explicit HloDynamicReshapeInstruction( + const Shape& shape, HloInstruction* data_operand, + absl::Span dim_sizes); + + // Returns the input dim sizes dimensions, which is operands[1:] + absl::Span dim_sizes() const { + return absl::MakeSpan(operands()).subspan(1, operand_count()); + } + + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + + // Returns the input dim size dimension, which is operands[1+i] + HloInstruction* dim_sizes(int64 i) const { return operands()[i + 1]; } +}; + class HloReshapeInstruction : public HloInstruction { public: explicit HloReshapeInstruction(const Shape& shape, HloInstruction* operand, diff --git a/tensorflow/compiler/xla/service/hlo_matchers_test.cc b/tensorflow/compiler/xla/service/hlo_matchers_test.cc index cb5cbd05d65..9c6509d8b73 100644 --- a/tensorflow/compiler/xla/service/hlo_matchers_test.cc +++ b/tensorflow/compiler/xla/service/hlo_matchers_test.cc @@ -276,10 +276,10 @@ TEST_F(HloMatchersTest, AsyncCopyMatcher) { /*element_size_in_bits=*/0, /*memory_space=*/2); auto p0 = HloInstruction::CreateParameter(0, shape_memspace1, "p0"); - auto copy_start = HloInstruction::CreateUnary( + auto copy_start = HloInstruction::CreateCopyStart( ShapeUtil::MakeTupleShape( {shape_memspace2, shape_memspace1, ShapeUtil::MakeShape(U32, {})}), - HloOpcode::kCopyStart, p0.get()); + p0.get()); auto copy_done = HloInstruction::CreateUnary( shape_memspace2, HloOpcode::kCopyDone, copy_start.get()); diff --git a/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc b/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc index 8ee8d332aff..076e31dc8eb 100644 --- a/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc +++ b/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc @@ -50,9 +50,9 @@ int64 PeakMemoryUseOfEntryComputation( HloComputation* computation = module->entry_computation(); const HloInstructionSequence& sequence = schedule.sequence(computation); - return HeapSimulator::Run(absl::make_unique(), - *computation, sequence, *alias_analysis, - size_function) + return HeapSimulator::Run( + absl::make_unique>(), + *computation, sequence, *alias_analysis, size_function) .ValueOrDie() .heap_size; } diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h index 1625d0bbae4..b50c7d9a584 100644 --- a/tensorflow/compiler/xla/service/hlo_opcode.h +++ b/tensorflow/compiler/xla/service/hlo_opcode.h @@ -123,6 +123,7 @@ namespace xla { V(kRemainder, "remainder", 2) \ V(kReplicaId, "replica-id", 0) \ V(kReshape, "reshape", 1) \ + V(kDynamicReshape, "dynamic-reshape", kHloOpcodeIsVariadic) \ V(kReverse, "reverse", 1) \ V(kRng, "rng", kHloOpcodeIsVariadic) \ V(kRngGetAndUpdateState, "rng-get-and-update-state", 0) \ diff --git a/tensorflow/compiler/xla/service/hlo_opcode_test.cc b/tensorflow/compiler/xla/service/hlo_opcode_test.cc index 136e6702b21..cceb60a70e9 100644 --- a/tensorflow/compiler/xla/service/hlo_opcode_test.cc +++ 
b/tensorflow/compiler/xla/service/hlo_opcode_test.cc @@ -58,6 +58,7 @@ TEST(HloOpcodeTest, OpcodeProperties) { case HloOpcode::kCustomCall: case HloOpcode::kDynamicSlice: case HloOpcode::kDynamicUpdateSlice: + case HloOpcode::kDynamicReshape: case HloOpcode::kFusion: case HloOpcode::kMap: case HloOpcode::kReduce: diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc index 2afa06a5df4..e2bbda3a607 100644 --- a/tensorflow/compiler/xla/service/hlo_parser.cc +++ b/tensorflow/compiler/xla/service/hlo_parser.cc @@ -883,7 +883,6 @@ bool HloParserImpl::ParseInstructionRhs(HloComputation::Builder* builder, case HloOpcode::kClz: case HloOpcode::kCollectivePermuteDone: case HloOpcode::kCopy: - case HloOpcode::kCopyStart: case HloOpcode::kCopyDone: case HloOpcode::kCos: case HloOpcode::kExp: @@ -1091,6 +1090,20 @@ bool HloParserImpl::ParseInstructionRhs(HloComputation::Builder* builder, } break; } + case HloOpcode::kCopyStart: { + // If the is_cross_program_prefetch attribute is not present then default + // to false. + optional is_cross_program_prefetch = false; + attrs["is_cross_program_prefetch"] = {/*required=*/false, AttrTy::kBool, + &is_cross_program_prefetch}; + if (!ParseOperands(&operands, /*expected_size=*/1) || + !ParseAttributes(attrs)) { + return false; + } + instruction = builder->AddInstruction(HloInstruction::CreateCopyStart( + shape, operands[0], *is_cross_program_prefetch)); + break; + } case HloOpcode::kReplicaId: { if (!ParseOperands(&operands, /*expected_size=*/0) || !ParseAttributes(attrs)) { @@ -1108,6 +1121,16 @@ bool HloParserImpl::ParseInstructionRhs(HloComputation::Builder* builder, builder->AddInstruction(HloInstruction::CreatePartitionId()); break; } + case HloOpcode::kDynamicReshape: { + if (!ParseOperands(&operands) || !ParseAttributes(attrs)) { + return false; + } + instruction = + builder->AddInstruction(HloInstruction::CreateDynamicReshape( + shape, operands[0], + absl::Span(operands).subspan(1))); + break; + } case HloOpcode::kReshape: { optional inferred_dimension; attrs["inferred_dimension"] = {/*required=*/false, AttrTy::kInt64, diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc index aba6aeff999..620e67c3a2f 100644 --- a/tensorflow/compiler/xla/service/hlo_parser_test.cc +++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc @@ -318,7 +318,7 @@ R"(HloModule CopyStartAndCopyDone_module ENTRY %CopyStartAndCopyDone (v1: f32[], v2: f32[2,3]) -> (f32[], f32[2,3]) { %v1 = f32[] parameter(0) - %copy-start.1 = (f32[], f32[], u32[]) copy-start(f32[] %v1) + %copy-start.1 = (f32[], f32[], u32[]) copy-start(f32[] %v1), is_cross_program_prefetch=true %copy-done.1 = f32[] copy-done((f32[], f32[], u32[]) %copy-start.1) %v2 = f32[2,3]{1,0:S(1)} parameter(1) %copy-start.2 = (f32[2,3]{1,0:S(2)}, f32[2,3]{1,0:S(1)}, u32[]) copy-start(f32[2,3]{1,0:S(1)} %v2) diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc index 83130108dd7..3a5e7ca6f40 100644 --- a/tensorflow/compiler/xla/service/hlo_runner.cc +++ b/tensorflow/compiler/xla/service/hlo_runner.cc @@ -259,9 +259,15 @@ StatusOr> HloRunner::ExecuteReplicated( return ExecuteReplicated(executable.get(), options, device_assignment); } -StatusOr> HloRunner::ExecuteReplicated( - Executable* executable, const ReplicatedExecuteOptions& options, - DeviceAssignment* device_assignment, ExecutionProfile* profile) { +StatusOr> HloRunner::ExecuteReplicatedImpl( + 
std::function>( + const std::vector&, + const std::vector>&)> + execution_helper, + std::function argument_count_provider, + std::function argument_provider, + const ReplicatedExecuteOptions& options, + DeviceAssignment* device_assignment) { std::vector> streams; std::vector service_run_options; @@ -269,12 +275,19 @@ StatusOr> HloRunner::ExecuteReplicated( // This reserve() call is necessary for correctness, because // argument_buffer_ptrs contains pointers into the elements of // argument_buffers. - argument_buffers.reserve(options.num_replicas * options.arguments.size()); + const int64 total_argument_count = [&]() { + int64 total = 0; + for (int64 i = 0; i < options.num_replicas; ++i) { + total += argument_count_provider(i); + } + return total; + }(); + argument_buffers.reserve(total_argument_count); // Plus one so we can safely get &argument_buffer_ptrs[0] in case there are // no arguments. - std::vector argument_buffer_ptrs( - options.num_replicas * options.arguments.size() + 1); + std::vector argument_buffer_ptrs(total_argument_count + + 1); std::vector> argument_buffer_slices; int64 index = 0; RunId run_id; @@ -288,7 +301,10 @@ StatusOr> HloRunner::ExecuteReplicated( device, streams.back().get(), device_assignment, run_id)); // Copy arguments to device. - for (const Literal* argument : options.arguments) { + const int64 argument_count = argument_count_provider(i); + for (int64 arg_index = 0; arg_index < argument_count; arg_index++) { + const Literal* const argument = argument_provider(i, arg_index); + TF_RET_CHECK(argument != nullptr); TF_ASSIGN_OR_RETURN( ScopedShapedBuffer argument_buffer, backend().transfer_manager()->AllocateScopedShapedBuffer( @@ -299,8 +315,7 @@ StatusOr> HloRunner::ExecuteReplicated( argument_buffer_ptrs[index++] = &argument_buffers.back(); } argument_buffer_slices.emplace_back( - &argument_buffer_ptrs[index - options.arguments.size()], - options.arguments.size()); + &argument_buffer_ptrs[index - argument_count], argument_count); } std::unique_ptr pool; @@ -355,39 +370,9 @@ StatusOr> HloRunner::ExecuteReplicated( } LOG(INFO) << "Replicated execution started"; - std::vector results; - if (!options.use_threads) { - TF_ASSIGN_OR_RETURN(results, - executable->ExecuteOnStreams(service_run_options, - argument_buffer_slices)); - } else { - tensorflow::mutex mutex; - std::vector> thread_results( - options.num_replicas); - { - LOG(INFO) << "Creating thread pool for " << options.num_replicas - << " replicas"; - tensorflow::thread::ThreadPool pool(tensorflow::Env::Default(), - "replicas", options.num_replicas); - for (int64 i = 0; i < options.num_replicas; ++i) { - pool.Schedule([&, i] { - auto result = executable->ExecuteOnStream( - &service_run_options[i], argument_buffer_slices[i], nullptr); - tensorflow::mutex_lock lock(mutex); - thread_results[i] = std::move(result); - }); - } - - // Note: the thread pool destructor guarantees it completes all work - // before we leave this scope. 
- } - for (auto& thread_result : thread_results) { - if (!thread_result.ok()) { - return thread_result.status(); - } - results.push_back(std::move(thread_result).ValueOrDie()); - } - } + TF_ASSIGN_OR_RETURN( + std::vector results, + execution_helper(service_run_options, argument_buffer_slices)); LOG(INFO) << "Replicated execution terminated"; std::vector exec_results; @@ -401,6 +386,104 @@ StatusOr> HloRunner::ExecuteReplicated( return std::move(exec_results); } +StatusOr> HloRunner::ExecuteReplicated( + Executable* executable, const ReplicatedExecuteOptions& options, + DeviceAssignment* device_assignment, ExecutionProfile* profile) { + return ExecuteReplicatedImpl( + [&](const std::vector& service_run_options, + const std::vector>& + argument_buffer_slices) + -> StatusOr> { + std::vector results; + if (!options.use_threads) { + TF_ASSIGN_OR_RETURN( + results, executable->ExecuteOnStreams(service_run_options, + argument_buffer_slices)); + } else { + tensorflow::mutex mutex; + std::vector> thread_results( + options.num_replicas); + { + LOG(INFO) << "Creating thread pool for " << options.num_replicas + << " replicas"; + tensorflow::thread::ThreadPool pool( + tensorflow::Env::Default(), "replicas", options.num_replicas); + for (int64 i = 0; i < options.num_replicas; ++i) { + pool.Schedule([&, i] { + auto result = executable->ExecuteOnStream( + &service_run_options[i], argument_buffer_slices[i], + nullptr); + tensorflow::mutex_lock lock(mutex); + thread_results[i] = std::move(result); + }); + } + + // Note: the thread pool destructor guarantees it completes all work + // before we leave this scope. + } + for (auto& thread_result : thread_results) { + if (!thread_result.ok()) { + return thread_result.status(); + } + results.push_back(std::move(thread_result).ValueOrDie()); + } + } + return results; + }, + [&](int64 replica) { return options.arguments.size(); }, + [&](int64 replica, int64 index) { return options.arguments[index]; }, + options, device_assignment); +} + +StatusOr> HloRunner::ExecuteReplicated( + std::function executable_provider, + std::function argument_count_provider, + std::function argument_provider, + const ReplicatedExecuteOptions& options) { + TF_ASSIGN_OR_RETURN( + DeviceAssignment device_assignment, + backend().computation_placer()->AssignDevices(options.num_replicas, 1)); + return ExecuteReplicatedImpl( + [&](const std::vector& service_run_options, + const std::vector>& + argument_buffer_slices) + -> StatusOr> { + TF_RET_CHECK(options.use_threads); + std::vector results; + tensorflow::mutex mutex; + std::vector> thread_results( + options.num_replicas); + { + LOG(INFO) << "Creating thread pool for " << options.num_replicas + << " replicas"; + tensorflow::thread::ThreadPool pool(tensorflow::Env::Default(), + "replicas", options.num_replicas); + for (int64 i = 0; i < options.num_replicas; ++i) { + for (const auto& arg : argument_buffer_slices[i]) { + TF_RET_CHECK(arg != nullptr); + } + pool.Schedule([&, i] { + auto result = executable_provider(i)->ExecuteOnStream( + &service_run_options[i], argument_buffer_slices[i], nullptr); + tensorflow::mutex_lock lock(mutex); + thread_results[i] = std::move(result); + }); + } + + // Note: the thread pool destructor guarantees it completes all work + // before we leave this scope. 
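[Editor's note.] The overload being added here takes per-replica callbacks instead of a single Executable and argument list. A usage sketch follows, assuming the XLA headers in this patch (hlo_runner.h, literal.h); the exact std::function template parameters are elided in this rendering of the diff, so the callback types below are inferred from how the callbacks are invoked (executable_provider(i) is dereferenced as an Executable*, argument_provider(i, j) yields a const Literal*), and the helper name is hypothetical. Note the TF_RET_CHECK above: this overload only supports options.use_threads.

// Sketch: run a different pre-compiled Executable on each replica, feeding
// per-replica Literal arguments through the provider callbacks.
StatusOr<std::vector<Literal>> RunPerReplica(
    HloRunner& runner, std::vector<Executable*> executables,
    std::vector<std::vector<const Literal*>> args_per_replica) {
  HloRunner::ReplicatedExecuteOptions options;
  options.num_replicas = executables.size();
  options.use_threads = true;  // required by this overload
  return runner.ExecuteReplicated(
      /*executable_provider=*/
      [&](int64 replica) { return executables[replica]; },
      /*argument_count_provider=*/
      [&](int64 replica) {
        return static_cast<int64>(args_per_replica[replica].size());
      },
      /*argument_provider=*/
      [&](int64 replica, int64 index) {
        return args_per_replica[replica][index];
      },
      options);
}
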
+ } + for (auto& thread_result : thread_results) { + if (!thread_result.ok()) { + return thread_result.status(); + } + results.push_back(std::move(thread_result).ValueOrDie()); + } + return results; + }, + argument_count_provider, argument_provider, options, &device_assignment); +} + StatusOr> HloRunner::ExecuteReplicated( std::unique_ptr module, const ReplicatedExecuteOptions& options) { diff --git a/tensorflow/compiler/xla/service/hlo_runner.h b/tensorflow/compiler/xla/service/hlo_runner.h index 7e8b301ab54..733bb8bff54 100644 --- a/tensorflow/compiler/xla/service/hlo_runner.h +++ b/tensorflow/compiler/xla/service/hlo_runner.h @@ -176,6 +176,17 @@ class HloRunner { Executable* executable, const ReplicatedExecuteOptions& options, DeviceAssignment* device_assignment, ExecutionProfile* profile = nullptr); + // Same as above, but with different reusable Executables. This may update the + // profile information in *executables. + // + // Note that this call ignores ReplicatedExecutionOptions::run_hlo_passes, + // since we've already compiled the Executable. + StatusOr> ExecuteReplicated( + std::function executable_provider, + std::function argument_count_provider, + std::function argument_provider, + const ReplicatedExecuteOptions& options); + // If backend is not created in the constructor, creates and returns the // default backend. If creation fails, crashes the program. // @@ -193,6 +204,17 @@ class HloRunner { int64 device, se::Stream* stream, DeviceAssignment* device_assignment, RunId run_id); + // Common implementation code for ExecuteReplicated() above. + StatusOr> ExecuteReplicatedImpl( + std::function>( + const std::vector&, + const std::vector>&)> + execution_helper, + std::function argument_count_provider, + std::function argument_provider, + const ReplicatedExecuteOptions& options, + DeviceAssignment* device_assignment); + std::unique_ptr backend_; }; diff --git a/tensorflow/compiler/xla/service/hlo_sharding_util.cc b/tensorflow/compiler/xla/service/hlo_sharding_util.cc index 007b6158fc2..e1e506b2892 100644 --- a/tensorflow/compiler/xla/service/hlo_sharding_util.cc +++ b/tensorflow/compiler/xla/service/hlo_sharding_util.cc @@ -106,21 +106,28 @@ HloSharding TransposeSharding(const HloSharding& sharding, if (sharding.IsTileMaximal()) { return sharding; } - const int64 rank = dimensions.size(); + auto perm_dimensions = dimensions; + if (sharding.ReplicateOnLastTileDim() && + dimensions.size() < sharding.tile_assignment().num_dimensions()) { + perm_dimensions.push_back(dimensions.size()); + } + const int64 rank = perm_dimensions.size(); std::vector tile_assignment_dim(rank); for (int64 i = 0; i < rank; ++i) { - tile_assignment_dim[i] = sharding.tile_assignment().dim(dimensions[i]); + tile_assignment_dim[i] = sharding.tile_assignment().dim(perm_dimensions[i]); } Array tile_assignment = sharding.tile_assignment(); tile_assignment.Reshape(tile_assignment_dim); tile_assignment.Each([&](absl::Span indices, int64* value) { std::vector src_indices(indices.size(), -1); for (int64 i = 0; i < indices.size(); ++i) { - src_indices[dimensions[i]] = indices[i]; + src_indices[perm_dimensions[i]] = indices[i]; } *value = sharding.tile_assignment()(src_indices); }); - return HloSharding::Tile(tile_assignment); + return sharding.ReplicateOnLastTileDim() + ? 
HloSharding::PartialTile(tile_assignment) + : HloSharding::Tile(tile_assignment); } absl::optional ReshapeSharding(const Shape& source_shape, @@ -227,8 +234,14 @@ absl::optional ReshapeSharding(const Shape& source_shape, } } Array new_tile_assignment = sharding.tile_assignment(); + if (sharding.ReplicateOnLastTileDim()) { + target_tile_assignment_dimensions.push_back( + sharding.tile_assignment().dimensions().back()); + } new_tile_assignment.Reshape(target_tile_assignment_dimensions); - return HloSharding::Tile(new_tile_assignment); + return sharding.ReplicateOnLastTileDim() + ? HloSharding::PartialTile(new_tile_assignment) + : HloSharding::Tile(new_tile_assignment); } HloSharding ReverseSharding(const HloSharding& sharding, @@ -246,7 +259,9 @@ HloSharding ReverseSharding(const HloSharding& sharding, } *device = sharding.tile_assignment()(original_indices); }); - return HloSharding::Tile(new_tile_assignment); + return sharding.ReplicateOnLastTileDim() + ? HloSharding::PartialTile(new_tile_assignment) + : HloSharding::Tile(new_tile_assignment); } HloSharding ReshapeToTileDimension(const HloSharding& sharding, int64 dim, @@ -343,6 +358,7 @@ HloSharding GatherOutputSharding(const HloSharding& index_sharding, HloSharding GatherIndexSharding(const HloSharding& output_sharding, const HloInstruction* hlo) { + CHECK(hlo->opcode() == HloOpcode::kGather); if (output_sharding.IsTileMaximal()) { return output_sharding; } @@ -355,6 +371,14 @@ HloSharding GatherIndexSharding(const HloSharding& output_sharding, output_sharding.tile_assignment().dim(i)); } } + int64 index_rank = hlo->operand(1)->shape().rank(); + + // Vector indices sharding is not supported yet. + if (index_rank > index_tile_assignment_dims.size()) { + index_tile_assignment_dims.insert( + index_tile_assignment_dims.begin() + dnums.index_vector_dim(), 1); + } + Array new_tile_assignment = output_sharding.tile_assignment(); if (new_tile_assignment.num_elements() != Product(index_tile_assignment_dims)) { diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc index d395fddcc5d..0af2a45bfc7 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier.cc @@ -703,6 +703,20 @@ Status ShapeVerifier::HandleBroadcast(HloInstruction* broadcast) { return Status::OK(); } +Status ShapeVerifier::HandleDynamicReshape(HloInstruction* dynamic_reshape) { + // Check for mixed precision. + const Shape& operand_shape = dynamic_reshape->operand(0)->shape(); + TF_RET_CHECK(SameElementType(dynamic_reshape->shape(), operand_shape)); + TF_RET_CHECK(ShapeUtil::ElementsIn(dynamic_reshape->shape()) == + ShapeUtil::ElementsIn(operand_shape)); + TF_RET_CHECK(dynamic_reshape->shape().rank() + 1 == + dynamic_reshape->operand_count()); + for (int64 i = 1; i < dynamic_reshape->operand_count(); ++i) { + TF_RET_CHECK(dynamic_reshape->operand(i)->shape().element_type() == S32); + } + return Status::OK(); +} + Status ShapeVerifier::HandleReshape(HloInstruction* reshape) { // Check for mixed precision. 
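[Editor's note.] As a construction-side illustration of the invariants the new HandleDynamicReshape check above enforces (matching element type and element count, and one s32[] size operand per output dimension, so operand_count == rank + 1), here is a minimal sketch using the CreateDynamicReshape factory added in this patch. The shapes and names are made up for the example, and the ShapeUtil::MakeShape overload with dynamic_dimensions is assumed from the existing XLA API.

// Reshape f32[2,3] into a rank-1 result with a dynamic bound of 6 elements.
// Output rank is 1, so there is 1 data operand plus 1 s32[] dimension-size
// operand, and both shapes hold 6 elements.
HloComputation::Builder b("dynamic_reshape_example");
HloInstruction* data = b.AddInstruction(HloInstruction::CreateParameter(
    0, ShapeUtil::MakeShape(F32, {2, 3}), "data"));
HloInstruction* d0 = b.AddInstruction(HloInstruction::CreateParameter(
    1, ShapeUtil::MakeShape(S32, {}), "d0"));
b.AddInstruction(HloInstruction::CreateDynamicReshape(
    ShapeUtil::MakeShape(F32, {6}, /*dynamic_dimensions=*/{true}), data,
    {d0}));
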
const Shape& operand_shape = reshape->operand(0)->shape(); diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h index 85b02e0518c..03fca5938ff 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.h +++ b/tensorflow/compiler/xla/service/hlo_verifier.h @@ -78,6 +78,7 @@ class ShapeVerifier : public DfsHloVisitor { Status HandleBitcast(HloInstruction* bitcast) override; Status HandleBroadcast(HloInstruction* broadcast) override; Status HandleReshape(HloInstruction* reshape) override; + Status HandleDynamicReshape(HloInstruction* dynamic_reshape) override; Status HandleTranspose(HloInstruction* transpose) override; Status HandleParameter(HloInstruction*) override; Status HandleFusion(HloInstruction*) override; diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc index 8d8930615b2..b290b1bd68b 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/instruction_fusion.cc @@ -102,6 +102,7 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) { case HloOpcode::kReducePrecision: case HloOpcode::kReplicaId: case HloOpcode::kReshape: + case HloOpcode::kDynamicReshape: case HloOpcode::kReverse: case HloOpcode::kRoundNearestAfz: case HloOpcode::kSelect: diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc index cc7fdeaf0f6..1446b55f5a8 100644 --- a/tensorflow/compiler/xla/service/interpreter/executable.cc +++ b/tensorflow/compiler/xla/service/interpreter/executable.cc @@ -52,6 +52,7 @@ InterpreterExecutable::InterpreterExecutable( } StatusOr InterpreterExecutable::Evaluate( + const ServiceExecutableRunOptions* run_options, const HloComputation& computation, absl::Span arg_literals) { // Execute the graph using the HloEvaluator. tensorflow::mutex_lock lock(evaluator_lock_); diff --git a/tensorflow/compiler/xla/service/interpreter/executable.h b/tensorflow/compiler/xla/service/interpreter/executable.h index ce68a8472f5..514ed029a22 100644 --- a/tensorflow/compiler/xla/service/interpreter/executable.h +++ b/tensorflow/compiler/xla/service/interpreter/executable.h @@ -51,7 +51,8 @@ class InterpreterExecutable : public InterpreterExecutableBase { static int64 ShapeSizeBytes(const Shape& shape); protected: - StatusOr Evaluate(const HloComputation& computation, + StatusOr Evaluate(const ServiceExecutableRunOptions* run_options, + const HloComputation& computation, absl::Span arg_literals) override TF_LOCKS_EXCLUDED(evaluator_lock_); diff --git a/tensorflow/compiler/xla/service/interpreter/executable_base.cc b/tensorflow/compiler/xla/service/interpreter/executable_base.cc index 4b6a8aa5202..745750bffe1 100644 --- a/tensorflow/compiler/xla/service/interpreter/executable_base.cc +++ b/tensorflow/compiler/xla/service/interpreter/executable_base.cc @@ -50,11 +50,15 @@ StatusOr InterpreterExecutableBase::ExecuteAsyncOnStream( // TransferManager methods below. 
std::vector argument_buffers; argument_buffers.reserve(arguments.size()); + int device_ordinal = run_options->device_ordinal(); + if (device_ordinal < 0) { + device_ordinal = 0; + } for (auto& argument : arguments) { const ShapeTree& buffers = argument.Buffers(); argument_buffers.push_back(ShapedBuffer(buffers.shape(), buffers.shape(), /*platform=*/nullptr, - /*device_ordinal=*/0)); + /*device_ordinal=*/device_ordinal)); auto in_it = buffers.begin(); auto out_it = argument_buffers.back().buffers().begin(); for (; in_it != buffers.end(); ++in_it, ++out_it) { @@ -118,7 +122,7 @@ StatusOr InterpreterExecutableBase::ExecuteAsyncOnStream( } TF_ASSIGN_OR_RETURN(Literal result_literal, - Evaluate(*computation, arg_literals)); + Evaluate(run_options, *computation, arg_literals)); // Shrink the generated dynamic shape into static shape. result_literal = result_literal.ToStatic(); diff --git a/tensorflow/compiler/xla/service/interpreter/executable_base.h b/tensorflow/compiler/xla/service/interpreter/executable_base.h index a02ab7af8d0..eb47841a179 100644 --- a/tensorflow/compiler/xla/service/interpreter/executable_base.h +++ b/tensorflow/compiler/xla/service/interpreter/executable_base.h @@ -44,6 +44,7 @@ class InterpreterExecutableBase : public Executable { protected: virtual StatusOr Evaluate( + const ServiceExecutableRunOptions* run_options, const HloComputation& computation, absl::Span arg_literals) = 0; diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc index bea0f1fb93c..55569cfde0e 100644 --- a/tensorflow/compiler/xla/service/layout_assignment.cc +++ b/tensorflow/compiler/xla/service/layout_assignment.cc @@ -1891,7 +1891,7 @@ Status LayoutAssignment::RunOnComputation( ? ShapeUtil::GetSubshape(instruction->literal().shape(), buffer.index()) .layout() - : LayoutUtil::GetDefaultLayoutForShape(buffer.shape()); + : GetUnconstrainedLayout(buffer); TF_RETURN_IF_ERROR(constraints.SetBufferLayout(new_layout, buffer, /*mandatory=*/false)); @@ -2278,6 +2278,7 @@ bool LayoutAssignment::InstructionCanChangeLayout( case HloOpcode::kReduce: case HloOpcode::kReplicaId: case HloOpcode::kReshape: + case HloOpcode::kDynamicReshape: case HloOpcode::kRng: case HloOpcode::kRngBitGenerator: case HloOpcode::kRngGetAndUpdateState: diff --git a/tensorflow/compiler/xla/service/layout_assignment.h b/tensorflow/compiler/xla/service/layout_assignment.h index a04d056c618..def620bcee9 100644 --- a/tensorflow/compiler/xla/service/layout_assignment.h +++ b/tensorflow/compiler/xla/service/layout_assignment.h @@ -27,6 +27,7 @@ limitations under the License. #include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" +#include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/service/call_graph.h" #include "tensorflow/compiler/xla/service/computation_layout.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" @@ -338,6 +339,9 @@ class LayoutAssignment : public HloModulePass { const ResultLayoutConstraint& layout_constraint, LayoutConstraints* constraints); + virtual Layout GetUnconstrainedLayout(const LogicalBuffer& buffer) { + return LayoutUtil::GetDefaultLayoutForShape(buffer.shape()); + } // Called after layouts of an instruction have been finalized to allow // subclasses to check for platform specific assumptions. 
virtual Status Verify(const HloInstruction* instruction) { diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc index b01ae2efe43..2963d546380 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc @@ -415,9 +415,10 @@ llvm::Instruction* AddRangeMetadata(int64 lower, int64 upper, return inst; } -string IrName(string a) { - a.erase(std::remove(a.begin(), a.end(), '%'), a.end()); - return a; +string IrName(absl::string_view a) { + std::string s(a); + s.erase(std::remove(s.begin(), s.end(), '%'), s.end()); + return s; } string IrName(absl::string_view a, absl::string_view b) { diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h index 642965b6470..c0a55e4da33 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h +++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h @@ -87,7 +87,7 @@ string DumpModuleToString(const llvm::Module& module); // - joining all of the nonempty inputs by '.', and then // - removing all '%'s. // -string IrName(string a); +string IrName(absl::string_view a); string IrName(absl::string_view a, absl::string_view b); string IrName(const HloInstruction* a, absl::string_view b = ""); diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.cc b/tensorflow/compiler/xla/service/memory_space_assignment.cc index c5ae0573bed..c53f2c19695 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment.cc @@ -80,7 +80,7 @@ float MemorySpaceAssignmentCostAnalysis::GetAlternateMemoryBenefit( } float MemorySpaceAssignmentCostAnalysis::GetMemoryBoundedness( - const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval, + const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval, MemorySpaceAssignmentCostAnalysis::Cache* cache) const { const HloInstruction& defining_instruction = *interval.buffer->defining_instruction(); @@ -236,15 +236,26 @@ int64 InstructionCountPrefetchIntervalPicker::PreferredEvictionEndTime( } int64 InstructionCountPrefetchIntervalPicker::LatestPrefetchStartTime( - const HloUse& use, int64 start_time, int64 end_time) const { + const Shape& shape, int64 start_time, int64 end_time, + const HloUse* use) const { return end_time - min_overlap_count_; } +int64 InstructionCountPrefetchIntervalPicker::PreferredPrefetchStartTime( + const Shape& shape, int64 earliest_prefetch_start_time, + int64 latest_prefetch_start_time, int64 prefetch_end_time) const { + return std::max(earliest_prefetch_start_time, + prefetch_end_time - max_overlap_count_); +} + void InstructionCountPrefetchIntervalPicker::Begin(const HloUse& use, int64 start_time, int64 end_time) { end_time_ = end_time; - current_prefetch_time_ = std::max(start_time, end_time_ - max_overlap_count_); + const Shape& shape = ShapeUtil::GetSubshape( + use.instruction->operand(use.operand_number)->shape(), use.operand_index); + current_prefetch_time_ = + PreferredPrefetchStartTime(shape, start_time, end_time, end_time); } int64 InstructionCountPrefetchIntervalPicker::Next() { @@ -361,18 +372,22 @@ int64 CostAnalysisPrefetchIntervalPicker::PreferredEvictionEndTime( } int64 CostAnalysisPrefetchIntervalPicker::LatestPrefetchStartTime( - const HloUse& use, int64 start_time, int64 end_time) const { - const Shape& shape = ShapeUtil::GetSubshape( - use.instruction->operand(use.operand_number)->shape(), use.operand_index); + 
const Shape& shape, int64 start_time, int64 end_time, + const HloUse* use) const { // Find the earliest time that satisfies max_async_copy_to_overlap_ratio_. float async_copy_elapsed = cost_analysis_.GetAsyncCopyElapsed(shape); - // Estimate the time we would save by having this op in alternate memory. - float elapsed_time = cost_analysis_.GetInstructionElapsed(*use.instruction); - float elapsed_time_in_alternate_mem = - cost_analysis_.GetInstructionElapsedInAlternateMemory( - *use.instruction, use.operand_number, - /*output_in_alternate_mem=*/false); - float inst_elapsed_reduction = elapsed_time - elapsed_time_in_alternate_mem; + // If there is a use, estimate the time we would save by having this op in + // alternate memory. + float inst_elapsed_reduction = 0.0f; + if (use) { + float elapsed_time = + cost_analysis_.GetInstructionElapsed(*use->instruction); + float elapsed_time_in_alternate_mem = + cost_analysis_.GetInstructionElapsedInAlternateMemory( + *use->instruction, use->operand_number, + /*output_in_alternate_mem=*/false); + inst_elapsed_reduction = elapsed_time - elapsed_time_in_alternate_mem; + } int end_nest_level = while_nest_level_[end_time]; // Find the latest time we're allowed to start prefetching. @@ -390,6 +405,33 @@ int64 CostAnalysisPrefetchIntervalPicker::LatestPrefetchStartTime( return latest_prefetch_time; } +int64 CostAnalysisPrefetchIntervalPicker::PreferredPrefetchStartTime( + const Shape& shape, int64 earliest_prefetch_start_time, + int64 latest_prefetch_start_time, int64 prefetch_end_time) const { + // Between the earliest and latest prefetch interval, find the interval + // closest to the preferred interval and start iterating from there. + float async_copy_elapsed = cost_analysis_.GetAsyncCopyElapsed(shape); + int64 preferred_prefetch_start_time = earliest_prefetch_start_time; + float preferred_interval = + preferred_async_copy_to_overlap_ratio_ * async_copy_elapsed; + float best_interval = GetLogicalIntervalElapsed(earliest_prefetch_start_time, + prefetch_end_time); + int end_nest_level = while_nest_level_[prefetch_end_time]; + for (int64 prefetch_start_time = earliest_prefetch_start_time + 1; + prefetch_start_time <= latest_prefetch_start_time; + ++prefetch_start_time) { + float interval = + GetLogicalIntervalElapsed(prefetch_start_time, prefetch_end_time); + if (while_nest_level_[prefetch_start_time] == end_nest_level && + std::abs(preferred_interval - interval) < + std::abs(preferred_interval - best_interval)) { + best_interval = interval; + preferred_prefetch_start_time = prefetch_start_time; + } + } + return preferred_prefetch_start_time; +} + int64 CostAnalysisPrefetchIntervalPicker::LatestPrefetchEndTime( int64 original_prefetch_end_time, int64 proposed_prefetch_end_time) const { // Iterate towards the beginning until we find a suitable end time that is the @@ -422,7 +464,8 @@ void CostAnalysisPrefetchIntervalPicker::Begin(const HloUse& use, // Find the latest time we're allowed to start prefetching. float min_interval = min_async_copy_to_overlap_ratio_ * async_copy_elapsed_; - latest_prefetch_time_ = LatestPrefetchStartTime(use, start_time, end_time); + latest_prefetch_time_ = + LatestPrefetchStartTime(shape, start_time, end_time, &use); // Find the earliest time we're allowed to start prefetching. 
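[Editor's note.] The new PreferredPrefetchStartTime above scans every candidate start time between the earliest and latest allowed times and keeps the one whose elapsed interval to the prefetch end is closest to preferred_ratio * async_copy_elapsed, restricted to candidates at the same while-nest level as the end time. The standalone sketch below restates that search outside the XLA classes; a plain cumulative-elapsed vector stands in for GetLogicalIntervalElapsed and a nest-level vector stands in for while_nest_level_, and all names are illustrative.

#include <cmath>
#include <cstdint>
#include <vector>

// Pick the start time whose overlap with `end` is closest to
// `preferred_interval`, considering only times at the same nest level.
int64_t PickPreferredStart(const std::vector<float>& cumulative_elapsed,
                           const std::vector<int>& nest_level,
                           int64_t earliest, int64_t latest, int64_t end,
                           float preferred_interval) {
  auto elapsed = [&](int64_t a, int64_t b) {
    return cumulative_elapsed[b] - cumulative_elapsed[a];
  };
  int64_t best_time = earliest;
  float best_interval = elapsed(earliest, end);
  for (int64_t t = earliest + 1; t <= latest; ++t) {
    const float interval = elapsed(t, end);
    if (nest_level[t] == nest_level[end] &&
        std::abs(preferred_interval - interval) <
            std::abs(preferred_interval - best_interval)) {
      best_interval = interval;
      best_time = t;
    }
  }
  return best_time;
}
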
float max_interval = max_async_copy_to_overlap_ratio_ * @@ -443,24 +486,10 @@ void CostAnalysisPrefetchIntervalPicker::Begin(const HloUse& use, return; } - // Between the earliest and latest prefetch interval, find the interval - // closest to the preferred interval and start iterating from there. - int64 starting_prefetch_time = earliest_prefetch_time_; + int64 starting_prefetch_time = PreferredPrefetchStartTime( + shape, earliest_prefetch_time_, latest_prefetch_time_, end_logical_time_); float preferred_interval = preferred_async_copy_to_overlap_ratio_ * async_copy_elapsed_; - float best_interval = - GetLogicalIntervalElapsed(earliest_prefetch_time_, end_logical_time_); - for (int64 prefetch_time = earliest_prefetch_time_ + 1; - prefetch_time <= latest_prefetch_time_; ++prefetch_time) { - float interval = - GetLogicalIntervalElapsed(prefetch_time, end_logical_time_); - if (while_nest_level_[prefetch_time] == end_nest_level && - std::abs(preferred_interval - interval) < - std::abs(preferred_interval - best_interval)) { - best_interval = interval; - starting_prefetch_time = prefetch_time; - } - } VLOG(4) << "Interval min/max/preferred = " << min_interval << " " << max_interval << " " << preferred_interval << " prefetch time earliest/latest/starting = " @@ -570,7 +599,8 @@ std::string CostAnalysisPrefetchIntervalPicker::ToNoCopyDebugString( absl::optional CostAnalysisPrefetchIntervalPicker::BufferIntervalAlternateMemoryBenefit( - const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval) const { + const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval) + const { return cost_analysis_.GetMemoryBoundedness(interval); } @@ -733,9 +763,9 @@ void AlternateMemoryBestFitHeap::FindAliases( } } -std::vector +std::vector AlternateMemoryBestFitHeap::GetSortedColocatedIntervals( - const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval) const { + const AlternateMemoryBestFitHeap::BufferInterval& interval) const { std::vector colocated_intervals; std::vector worklist = {&interval}; while (!worklist.empty()) { @@ -864,7 +894,7 @@ bool AlternateMemoryBestFitHeap::IsUseAllowedInAlternateMemory( } void AlternateMemoryBestFitHeap::AppendBufferInfoDebugString( - const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval, + const AlternateMemoryBestFitHeap::BufferInterval& interval, std::string* debug_str) const { // Columns in buffer information: // buffer_id: int. This value can be used to match the allocation in @@ -954,7 +984,7 @@ void AlternateMemoryBestFitHeap::DumpDebugStringsIfEnabled() const { options_.dump_fn("allocinfo", allocation_info_str_); } -HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { +HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { std::vector sorted_buffer_intervals = GetSortedBufferIntervals(); @@ -1051,6 +1081,7 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { allocation_values); // Retry allocating this value with larger limits if allocation fails. + bool repacked = false; for (int retry_number = 0; retry_number < options_.max_retries; retry_number++) { bool final_retry = (retry_number == options_.max_retries - 1); @@ -1064,11 +1095,13 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { UncommitPendingChunks(absl::MakeSpan(allocation_values)); VLOG(2) << "Couldn't allocate. 
Retry number " << retry_number; } else if (result_is(result, Result::kFailOutOfMemory) && - num_repacks_ < options_.max_repacks) { + num_repacks_ < options_.max_repacks && !repacked) { UncommitPendingChunks(absl::MakeSpan(allocation_values)); ++num_repacks_; + repacked = true; CHECK_NE(options_.repacker, nullptr); - std::vector repack_allocation_blocks; + std::vector + repack_allocation_blocks; ExportAllocationsForRepacking(repack_allocation_blocks); VLOG(2) << "Repacking."; auto repack_status = @@ -1076,7 +1109,7 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { CHECK_EQ(repack_status.status(), Status::OK()); VLOG(2) << "Repack complete. Modified = " << *repack_status; if (*repack_status) { - ImportRepackedAllocations(absl::MakeSpan(repack_allocation_blocks)); + ImportRepackedAllocations(); --retry_number; } } else { @@ -1367,21 +1400,80 @@ void AlternateMemoryBestFitHeap::AllocateCrossProgramPrefetchBuffer( // Find the earliest use. const auto& instruction_schedule = hlo_live_range_.instruction_schedule(); auto uses = buffer->uses(); - auto first_use = - absl::c_min_element(uses, [&](const HloUse& lhs, const HloUse& rhs) { - return instruction_schedule.at(lhs.instruction) < - instruction_schedule.at(rhs.instruction); - }); + auto use_schedule_compare = [&](const HloUse& lhs, const HloUse& rhs) { + return instruction_schedule.at(lhs.instruction) < + instruction_schedule.at(rhs.instruction); + }; + auto first_use = absl::c_min_element(uses, use_schedule_compare); int64 latest_prefetch_time = instruction_schedule.at(first_use->instruction); + // Find the latest use time. + int64 last_use_time = instruction_schedule.at( + absl::c_max_element(uses, use_schedule_compare)->instruction); + for (const HloValue* colocation : prefetch_candidate->colocations) { + last_use_time = std::max( + last_use_time, + instruction_schedule.at( + absl::c_max_element(colocation->uses(), use_schedule_compare) + ->instruction)); + } + + int64 end_of_program_prefetch_end_time = instruction_schedule.size() - 1; + int64 end_of_program_prefetch_start_time = + options_.prefetch_interval_picker->PreferredPrefetchStartTime( + buffer->defining_position().shape(), last_use_time, + end_of_program_prefetch_end_time, end_of_program_prefetch_end_time); + VLOG(2) << "last use time = " << last_use_time + << ", end-of-program prefetch start time = " + << end_of_program_prefetch_start_time; + bool free_buffer = + (end_of_program_prefetch_start_time > last_use_time && + end_of_program_prefetch_start_time < end_of_program_prefetch_end_time); + int64 cross_program_prefetch_end_time = + free_buffer ? 
last_use_time : prefetch_candidate->end; + AddAsyncCopy(*allocations.back(), MemorySpace::kAlternate, chunk_candidate.chunk, prefetch_candidate->start, - prefetch_candidate->end, latest_prefetch_time, &allocations); + cross_program_prefetch_end_time, latest_prefetch_time, + &allocations, + /*is_cross_program_prefetch=*/true); absl::c_for_each(uses, [&](auto& use) { allocations.back()->AddUse(use); }); + int64 cross_program_prefetch_offset = allocations.back()->chunk().offset; + + if (free_buffer) { + VLOG(2) << "Adding an end-of-program prefetch for freed " + "cross-program-prefetched buffer."; + AddAsyncCopy(*allocations.front(), MemorySpace::kAlternate, + chunk_candidate.chunk, end_of_program_prefetch_start_time, + end_of_program_prefetch_end_time, + end_of_program_prefetch_end_time, &allocations); + CHECK_EQ(cross_program_prefetch_offset, allocations.back()->chunk().offset); + } + for (auto& allocation : allocations) { allocations_->push_back(std::move(allocation)); } + // Add a repack allocation block for the Allocation objects in alternate + // memory. + CHECK_EQ(repack_allocation_blocks_.size(), 0); + for (const auto& allocation : *allocations_) { + if (allocation->memory_space() == MemorySpace::kAlternate) { + repack_allocation_blocks_.push_back(MakeRepackAllocationBlock( + allocation->start_time(), allocation->end_time(), + allocation->chunk().size, allocation->chunk().offset, + static_cast(repack_allocation_blocks_.size()), + allocation.get())); + RepackAllocationBlock* inserted = &repack_allocation_blocks_.back(); + for (RepackAllocationBlock& colocation : repack_allocation_blocks_) { + colocation.colocations.push_back(inserted); + if (&colocation != inserted) { + inserted->colocations.push_back(&colocation); + } + } + } + } + ClearPendingChunks(); } @@ -1560,29 +1652,27 @@ bool AlternateMemoryBestFitHeap::AreIntervalsReservedInAlternateMemory( } void AlternateMemoryBestFitHeap::ExportAllocationsForRepacking( - std::vector& - allocations) { + std::vector& allocations) { for (RepackAllocationBlock& allocation_block : repack_allocation_blocks_) { allocations.push_back(&allocation_block); } } -void AlternateMemoryBestFitHeap::ImportRepackedAllocations( - absl::Span - repacked_allocations) { +void AlternateMemoryBestFitHeap::ImportRepackedAllocations() { interval_tree_ = {}; - for (RepackAllocationBlock* allocation_block : repacked_allocations) { - MemorySpaceAssignment::Allocation* allocation = allocation_block->opaque; + for (RepackAllocationBlock& allocation_block : repack_allocation_blocks_) { + MemorySpaceAssignment::Allocation* allocation = allocation_block.allocation; VLOG(3) << "Moved " << allocation->ToString() << ", size " - << allocation->chunk().size << " from " - << allocation_block->initial_offset << " to " - << allocation_block->offset; - allocation_block->opaque->mutable_chunk()->offset = - allocation_block->offset; - interval_tree_.Add(allocation_block->start_time, allocation_block->end_time, - {allocation_block->offset, allocation_block->size}); - allocation_block->initial_offset = allocation_block->offset; - allocation_block->offset = -1; + << allocation->chunk().size << ", (" << allocation_block.start_time + << ", " << allocation_block.end_time << ") from " + << allocation_block.initial_offset << " to " + << allocation_block.offset; + allocation_block.allocation->mutable_chunk()->offset = + allocation_block.offset; + interval_tree_.Add(allocation_block.start_time, allocation_block.end_time, + {allocation_block.offset, allocation_block.size}); + 
allocation_block.initial_offset = allocation_block.offset; + allocation_block.offset = -1; } } @@ -1655,17 +1745,19 @@ void AlternateMemoryBestFitHeap::FinalizeAllocations( // Export these to repack_allocation_blocks_ so that we can repack them to // reduce fragmentation. for (auto& colocation : colocation_map) { - std::vector colocations; + std::vector colocations; for (MemorySpaceAssignment::Allocation* colocated_allocation : colocation.second) { - repack_allocation_blocks_.push_back( - {colocated_allocation->start_time(), colocated_allocation->end_time(), - colocated_allocation->chunk().size, /*offset=*/-1, - colocated_allocation->chunk().offset, /*colocations=*/{}, - colocated_allocation}); + repack_allocation_blocks_.push_back(MakeRepackAllocationBlock( + colocated_allocation->start_time(), colocated_allocation->end_time(), + colocated_allocation->chunk().size, + colocated_allocation->chunk().offset, + static_cast(repack_allocation_blocks_.size()), + colocated_allocation)); colocations.push_back(&repack_allocation_blocks_.back()); } - for (RepackAllocationBlock* repack_block : colocations) { + for (MemorySpaceAssignmentRepacker::AllocationBlock* repack_block : + colocations) { repack_block->colocations = colocations; } } @@ -1842,7 +1934,8 @@ void AlternateMemoryBestFitHeap::AddAsyncCopy( const MemorySpaceAssignment::Allocation& prev_allocation, MemorySpace memory_space, absl::optional chunk, int64 start_time, int64 end_time, int64 copy_done_schedule_before_time, - MemorySpaceAssignment::AllocationSequence* allocations) { + MemorySpaceAssignment::AllocationSequence* allocations, + bool is_cross_program_prefetch) { VLOG(3) << "Copy to " << (memory_space == MemorySpaceAssignment::MemorySpace::kDefault ? "default" @@ -1854,7 +1947,7 @@ void AlternateMemoryBestFitHeap::AddAsyncCopy( allocations->push_back( absl::make_unique( prev_allocation, memory_space, chunk, start_time, end_time, - copy_done_schedule_before_time)); + copy_done_schedule_before_time, is_cross_program_prefetch)); // Register the additional async copy with the interval tree to keep track of // the limit at any given time. @@ -2116,12 +2209,15 @@ int64 AlternateMemoryBestFitHeap::FindPrefetchEndTime( const AllocationRequest& request, int64 earliest_prefetch_time) const { int64 prefetch_end_time = request.latest_prefetch_time; + const HloUse& use = request.use->hlo_use; + const Shape& shape = ShapeUtil::GetSubshape( + use.instruction->operand(use.operand_number)->shape(), use.operand_index); for (int retry_number = 0; retry_number < options_.prefetch_copy_done_reorder_max_retries; ++retry_number) { int64 latest_prefetch_time = options_.prefetch_interval_picker->LatestPrefetchStartTime( - request.use->hlo_use, earliest_prefetch_time, prefetch_end_time); + shape, earliest_prefetch_time, prefetch_end_time, &use); VLOG(4) << "Latest prefetch start time = " << latest_prefetch_time << ", earliest prefetch start time = " << earliest_prefetch_time << ", prefetch end time = " << prefetch_end_time; @@ -2356,8 +2452,8 @@ MemorySpaceAssignment::GetMemoryBoundednessBufferIntervalCompare( return x_memory_boundedness > y_memory_boundedness; } // Tie-break if the memory boundedness is the same. 
- return GlobalDecreasingSizeBestFitHeap::GetSpatialBufferIntervalCompare()( - x, y); + return GlobalDecreasingSizeBestFitHeap< + HloValue>::GetSpatialBufferIntervalCompare()(x, y); }; } @@ -2428,7 +2524,9 @@ FindCrossProgramPrefetchCandidate( const HloAliasAnalysis& alias_analysis, const HloLiveRange& hlo_live_range, const MemorySpaceAssignment::Options& options) { std::vector candidates; - for (HloValue* value : alias_analysis.dataflow_analysis().values()) { + for (const HloBuffer& buffer : alias_analysis.buffers()) { + CHECK_GE(buffer.values().size(), 1); + const HloValue* value = buffer.values().at(0); if (IsCrossProgramPrefetchCandidate(*value, options)) { MemorySpaceAssignment::BufferInterval interval; interval.buffer = value; @@ -2436,6 +2534,7 @@ FindCrossProgramPrefetchCandidate( interval.start = 0; interval.end = hlo_live_range.schedule_end_time(); interval.need_allocation = true; + interval.colocations = {++buffer.values().begin(), buffer.values().end()}; candidates.emplace_back(interval); } } @@ -2665,9 +2764,9 @@ Status MemorySpaceAssignment::CopyAllocation::Process( Shape shape = defining_position().shape(); HloInstruction* producing_instruction = AddGetTupleElements(); HloComputation* computation = producing_instruction->parent(); - copy_start_ = computation->AddInstruction(HloInstruction::CreateUnary( + copy_start_ = computation->AddInstruction(HloInstruction::CreateCopyStart( ShapeUtil::MakeTupleShape({shape, shape, ShapeUtil::MakeShape(U32, {})}), - HloOpcode::kCopyStart, producing_instruction)); + producing_instruction, is_cross_program_prefetch_)); copy_done_ = computation->AddInstruction( HloInstruction::CreateUnary(shape, HloOpcode::kCopyDone, copy_start_)); VLOG(4) << "Created " << copy_start_->name() diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.h b/tensorflow/compiler/xla/service/memory_space_assignment.h index d366c06a599..04737663424 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.h +++ b/tensorflow/compiler/xla/service/memory_space_assignment.h @@ -106,7 +106,7 @@ class MemorySpaceAssignmentCostAnalysis { // BufferInterval. The larger this number, the higher priority it will be // placed in the alternate memory. float GetMemoryBoundedness( - const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval, + const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval, Cache* cache = nullptr) const; // Returns the elapsed time in seconds due to compute only. @@ -200,8 +200,15 @@ class PrefetchIntervalPicker { int64 latest_end_time) const = 0; // Returns the latest time that a prefetch can start. - virtual int64 LatestPrefetchStartTime(const HloUse& use, int64 start_time, - int64 end_time) const = 0; + virtual int64 LatestPrefetchStartTime(const Shape& shape, int64 start_time, + int64 end_time, + const HloUse* use) const = 0; + + // Returns the preferred time that a prefetch can start. + virtual int64 PreferredPrefetchStartTime(const Shape& shape, + int64 earliest_prefetch_start_time, + int64 latest_prefetch_start_time, + int64 prefetch_end_time) const = 0; // Returns the latest time that a prefetch can end that is less than or equal // to proposed_prefetch_end_time. @@ -235,7 +242,8 @@ class PrefetchIntervalPicker { // of placing the BufferInterval in the alternate memory. The larger value, // the more beneficial. 
virtual absl::optional BufferIntervalAlternateMemoryBenefit( - const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval) const { + const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval) + const { return absl::nullopt; } @@ -268,8 +276,14 @@ class InstructionCountPrefetchIntervalPicker : public PrefetchIntervalPicker { int64 PreferredEvictionEndTime(const Shape& shape, int64 start_time, int64 latest_end_time) const override; - int64 LatestPrefetchStartTime(const HloUse& use, int64 start_time, - int64 end_time) const override; + int64 LatestPrefetchStartTime(const Shape& shape, int64 start_time, + int64 end_time, + const HloUse* use) const override; + + int64 PreferredPrefetchStartTime(const Shape& shape, + int64 earliest_prefetch_start_time, + int64 latest_prefetch_start_time, + int64 prefetch_end_time) const override; void Begin(const HloUse& use, int64 start_time, int64 end_time) override; @@ -307,11 +321,18 @@ class CostAnalysisPrefetchIntervalPicker : public PrefetchIntervalPicker { int64 PreferredEvictionEndTime(const Shape& shape, int64 start_time, int64 latest_end_time) const override; - int64 LatestPrefetchStartTime(const HloUse& use, int64 start_time, - int64 end_time) const override; int64 LatestPrefetchEndTime(int64 original_prefetch_end_time, int64 proposed_prefetch_end_time) const override; + int64 LatestPrefetchStartTime(const Shape& shape, int64 start_time, + int64 end_time, + const HloUse* use) const override; + + int64 PreferredPrefetchStartTime(const Shape& shape, + int64 earliest_prefetch_start_time, + int64 latest_prefetch_start_time, + int64 prefetch_end_time) const override; + void Begin(const HloUse& use, int64 start_time, int64 end_time) override; int64 Next() override; @@ -324,7 +345,7 @@ class CostAnalysisPrefetchIntervalPicker : public PrefetchIntervalPicker { int64 end_time) const override; absl::optional BufferIntervalAlternateMemoryBenefit( - const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval) + const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval) const override; private: @@ -370,9 +391,10 @@ class CostAnalysisPrefetchIntervalPicker : public PrefetchIntervalPicker { class MemorySpaceAssignment { public: using Chunk = HeapSimulator::Chunk; - using BufferInterval = GlobalDecreasingSizeBestFitHeap::BufferInterval; + using BufferInterval = + GlobalDecreasingSizeBestFitHeap::BufferInterval; using BufferIntervalCompare = - GlobalDecreasingSizeBestFitHeap::BufferIntervalCompare; + GlobalDecreasingSizeBestFitHeap::BufferIntervalCompare; using IsAllowedInAlternateMemoryFunction = std::function; @@ -435,7 +457,7 @@ class MemorySpaceAssignment { // The repacking algorithm to reduce fragmentation. Must be non-null if // max_repacks is greater than 0. - MemorySpaceAssignmentRepacker* repacker = nullptr; + MemorySpaceAssignmentRepacker* repacker = nullptr; // If true, tries allocating buffers across (e.g., before and inside a while // loop body) sequential calls (kWhile, kCall, and kConditional). 
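// Usage sketch (not part of the patch): how a client can drive the new
// non-templated repacker interface directly, mirroring the unit tests added
// below. Assumes compilation inside the XLA tree; the times, sizes, and the
// 100-byte / 1-byte-alignment budget are arbitrary illustration values.
#include <vector>

#include "tensorflow/compiler/xla/service/memory_space_assignment_best_fit_repacker.h"

namespace xla {

bool RepackTwoBlocksExample() {
  using AllocationBlock = MemorySpaceAssignmentRepacker::AllocationBlock;
  // Field order: start_time, end_time, size, offset, initial_offset, id,
  // colocations. offset == -1 means "not assigned yet"; id must be unique so
  // the heap's tie-breaking comparison stays deterministic.
  AllocationBlock a{/*start_time=*/10, /*end_time=*/20, /*size=*/10,
                    /*offset=*/-1, /*initial_offset=*/0, /*id=*/0, {}};
  AllocationBlock b{/*start_time=*/5, /*end_time=*/25, /*size=*/15,
                    /*offset=*/-1, /*initial_offset=*/16, /*id=*/1, {}};
  std::vector<AllocationBlock*> blocks = {&a, &b};
  MemorySpaceAssignmentBestFitRepacker repacker(/*max_size=*/100,
                                                /*alignment=*/1);
  bool modified = repacker.Repack(absl::MakeSpan(blocks)).ValueOrDie();
  // On success, a.offset and b.offset now hold non-overlapping offsets within
  // the 100-byte budget; on failure they stay -1 and the caller keeps (or
  // retries) its previous assignment.
  return modified;
}

}  // namespace xla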
@@ -559,12 +581,14 @@ class MemorySpaceAssignment { public: CopyAllocation(const Allocation& prev_allocation, MemorySpace memory_space, absl::optional chunk, int64 start_time, - int64 end_time, int64 copy_done_schedule_before_time) + int64 end_time, int64 copy_done_schedule_before_time, + bool is_cross_program_prefetch = false) : Allocation(/*defining_position=*/{nullptr, {}}, memory_space, chunk, start_time, end_time), prev_allocation_(prev_allocation), copy_start_schedule_after_(start_time), - copy_done_schedule_before_(copy_done_schedule_before_time) {} + copy_done_schedule_before_(copy_done_schedule_before_time), + is_cross_program_prefetch_(is_cross_program_prefetch) {} bool is_copy_allocation() const override { return true; } @@ -604,6 +628,10 @@ class MemorySpaceAssignment { copy_start_schedule_after_ = copy_start_schedule_after; } + bool is_cross_program_prefetch() const { + return is_cross_program_prefetch_; + } + bool operator==(const CopyAllocation& other) const; std::string ToString() const override; @@ -615,6 +643,7 @@ class MemorySpaceAssignment { // is before copy_done_schedule_before_. int64 copy_start_schedule_after_; int64 copy_done_schedule_before_; + bool is_cross_program_prefetch_; HloInstruction* copy_start_; HloInstruction* copy_done_; }; @@ -913,7 +942,8 @@ class AsynchronousCopyOrdering { // This class inherits from GlobalDecreasingSizeBestFitHeap with a notion of // maximum size. -class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { +class AlternateMemoryBestFitHeap + : public GlobalDecreasingSizeBestFitHeap { public: using MemorySpace = MemorySpaceAssignment::MemorySpace; using AllocationValue = MemorySpaceAssignment::AllocationValue; @@ -940,11 +970,15 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { void AllocateCrossProgramPrefetchBuffer( HloModule* module, absl::optional prefetch_candidate); - HeapSimulator::Result Finish() override; + HeapSimulator::Result Finish() override; private: - using RepackAllocationBlock = MemorySpaceAssignmentRepacker< - MemorySpaceAssignment::Allocation*>::AllocationBlock; + // We inherit AllocationBlock struct to attach the Allocation information to + // make importing repacked offsets easier. + struct RepackAllocationBlock + : MemorySpaceAssignmentRepacker::AllocationBlock { + MemorySpaceAssignment::Allocation* allocation; + }; // An allocation request for a use segment. A use segment is the time segment // between the definition and the first use, and the time segment between the @@ -1169,19 +1203,20 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { // Exports the allocations for repacking and puts them into the vector in the // parameter. void ExportAllocationsForRepacking( - std::vector& allocations); + std::vector& + allocations); // Imports repacked allocations and updates the internal data structures // consistent with the new packing. - void ImportRepackedAllocations( - absl::Span repacked_allocations); + void ImportRepackedAllocations(); // Adds an asynchronous copy to the allocations. 
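+  // When `is_cross_program_prefetch` is true, the copy-start instruction that
+  // is eventually emitted for this copy is tagged as a cross-program prefetch.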
void AddAsyncCopy(const MemorySpaceAssignment::Allocation& prev_allocation, MemorySpace memory_space, absl::optional chunk, int64 start_time, int64 end_time, int64 copy_done_schedule_before_time, - MemorySpaceAssignment::AllocationSequence* allocations); + MemorySpaceAssignment::AllocationSequence* allocations, + bool is_cross_program_prefetch = false); // This method is used for committing the chunk candidate but adding it to // pending_chunks_ so that we can "uncommit" them in case we need to roll back @@ -1215,6 +1250,22 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { return options_.max_size_in_bytes - reserved_in_bytes_; } + // Creates and returns a RepackAllocationBlock. + static RepackAllocationBlock MakeRepackAllocationBlock( + int64 start_time, int64 end_time, int64 size, int64 initial_offset, + int64 id, MemorySpaceAssignment::Allocation* allocation) { + RepackAllocationBlock allocation_block; + allocation_block.start_time = start_time; + allocation_block.end_time = end_time; + allocation_block.size = size; + allocation_block.offset = -1; + allocation_block.initial_offset = initial_offset; + allocation_block.id = id; + allocation_block.colocations = {}; + allocation_block.allocation = allocation; + return allocation_block; + } + MemorySpaceAssignment::AllocationSequence* allocations_; const MemorySpaceAssignment::Options& options_; const HloAliasAnalysis& alias_analysis_; diff --git a/tensorflow/compiler/xla/service/memory_space_assignment_best_fit_repacker.cc b/tensorflow/compiler/xla/service/memory_space_assignment_best_fit_repacker.cc new file mode 100644 index 00000000000..53b092f1939 --- /dev/null +++ b/tensorflow/compiler/xla/service/memory_space_assignment_best_fit_repacker.cc @@ -0,0 +1,88 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/memory_space_assignment_best_fit_repacker.h" + +#include "tensorflow/compiler/xla/service/heap_simulator.h" + +namespace xla { + +namespace { + +using AllocationBlock = MemorySpaceAssignmentRepacker::AllocationBlock; +using Type = GlobalDecreasingSizeBestFitHeap::Type; + +// This class inherits GlobalDecreasingSizeBestFitHeap and converts +// AllocationBlock objects into BufferIntervals that the heap algorithm +// understands. +class BestFitRepacker + : public GlobalDecreasingSizeBestFitHeap { + public: + BestFitRepacker(int64 max_size, int64 alignment, Type type) + : GlobalDecreasingSizeBestFitHeap(alignment, type), + max_size_(max_size) {} + + void ImportAllocationBlocks(absl::Span allocations) { + allocation_blocks_ = allocations; + for (AllocationBlock* allocation_block : allocations) { + // Check if any of the colocations are already added to buffer_intervals_. 
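+      // Only the first block seen from each colocation group keeps
+      // need_allocation = true; later members are recorded as colocations of
+      // that interval, so the whole group is assigned a single offset.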
+ bool need_allocation = true; + auto aliased_it = absl::c_find_if( + allocation_block->colocations, [&](AllocationBlock* search) { + return buffer_intervals_.contains(search); + }); + if (aliased_it != allocation_block->colocations.end()) { + buffer_intervals_[*aliased_it].colocations.push_back(allocation_block); + need_allocation = false; + } + buffer_intervals_[allocation_block] = {allocation_block, + allocation_block->size, + allocation_block->start_time, + allocation_block->end_time, + {}, + need_allocation}; + } + } + + bool Repack() { + Finish(); + bool success = result_.heap_size <= max_size_; + if (success) { + for (AllocationBlock* block : allocation_blocks_) { + auto chunk_it = result_.chunk_map.find(block); + if (chunk_it != result_.chunk_map.end()) { + block->offset = chunk_it->second.offset; + } + } + } + return success; + } + + private: + int64 max_size_; + absl::Span allocation_blocks_; +}; + +} // namespace + +StatusOr MemorySpaceAssignmentBestFitRepacker::Repack( + absl::Span allocations) { + BestFitRepacker best_fit_repacker = + BestFitRepacker(max_size_, alignment_, type_); + best_fit_repacker.ImportAllocationBlocks(allocations); + return best_fit_repacker.Repack(); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/memory_space_assignment_best_fit_repacker.h b/tensorflow/compiler/xla/service/memory_space_assignment_best_fit_repacker.h new file mode 100644 index 00000000000..6937b8b0e8c --- /dev/null +++ b/tensorflow/compiler/xla/service/memory_space_assignment_best_fit_repacker.h @@ -0,0 +1,44 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_BEST_FIT_REPACKER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_BEST_FIT_REPACKER_H_ + +#include "tensorflow/compiler/xla/service/heap_simulator.h" +#include "tensorflow/compiler/xla/service/memory_space_assignment_repacking.h" + +namespace xla { + +// This is a repacker algorithm that wraps around best fit heap algorithm in +// heap simulator. 
+class MemorySpaceAssignmentBestFitRepacker + : public MemorySpaceAssignmentRepacker { + public: + using Type = GlobalDecreasingSizeBestFitHeap::Type; + + explicit MemorySpaceAssignmentBestFitRepacker( + int64 max_size, int64 alignment, + Type type = GlobalDecreasingSizeBestFitHeap::kTemporal) + : MemorySpaceAssignmentRepacker(max_size, alignment), type_(type) {} + + StatusOr Repack(absl::Span allocations) override; + + private: + Type type_; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_BEST_FIT_REPACKER_H_ diff --git a/tensorflow/compiler/xla/service/memory_space_assignment_best_fit_repacker_test.cc b/tensorflow/compiler/xla/service/memory_space_assignment_best_fit_repacker_test.cc new file mode 100644 index 00000000000..44da2828eac --- /dev/null +++ b/tensorflow/compiler/xla/service/memory_space_assignment_best_fit_repacker_test.cc @@ -0,0 +1,89 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/memory_space_assignment_best_fit_repacker.h" + +#include "tensorflow/core/platform/test.h" + +namespace xla { + +class MemorySpaceAssignmentBestFitRepackerTest : public ::testing::Test { + protected: + using AllocationBlock = MemorySpaceAssignmentRepacker::AllocationBlock; + + MemorySpaceAssignmentBestFitRepackerTest() : repacker_(100, 1) {} + + AllocationBlock* MakeAllocationBlock(int64 start_time, int64 end_time, + int64 size, int64 initial_offset = -1) { + allocation_blocks_.push_back({start_time, + end_time, + size, + -1, + initial_offset, + static_cast(allocation_blocks_.size()), + {}}); + AllocationBlock* block = &allocation_blocks_.back(); + block->colocations.push_back(block); + return block; + } + + std::list allocation_blocks_; + MemorySpaceAssignmentBestFitRepacker repacker_; +}; + +TEST_F(MemorySpaceAssignmentBestFitRepackerTest, Simple) { + std::vector allocation_blocks; + allocation_blocks.push_back(MakeAllocationBlock(10, 20, 10)); + allocation_blocks.push_back(MakeAllocationBlock(5, 25, 15)); + EXPECT_TRUE(*repacker_.Repack(absl::MakeSpan(allocation_blocks))); + + EXPECT_EQ(allocation_blocks[0]->offset, 15); + EXPECT_EQ(allocation_blocks[1]->offset, 0); +} + +TEST_F(MemorySpaceAssignmentBestFitRepackerTest, Colocation) { + std::vector allocation_blocks; + allocation_blocks.push_back(MakeAllocationBlock(0, 2, 10)); + allocation_blocks.push_back(MakeAllocationBlock(10, 20, 10)); + // Allocation blocks 0 and 1 are colocated. 
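+  // (MakeAllocationBlock already makes each block list itself, so after the
+  // two push_backs below the repacker sees the full colocation group.)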
+ allocation_blocks[0]->colocations.push_back(allocation_blocks[1]); + allocation_blocks[1]->colocations.push_back(allocation_blocks[0]); + allocation_blocks.push_back(MakeAllocationBlock(5, 25, 15)); + EXPECT_TRUE(*repacker_.Repack(absl::MakeSpan(allocation_blocks))); + + EXPECT_EQ(allocation_blocks[0]->offset, 15); + EXPECT_EQ(allocation_blocks[1]->offset, 15); + EXPECT_EQ(allocation_blocks[2]->offset, 0); +} + +TEST_F(MemorySpaceAssignmentBestFitRepackerTest, TooLarge) { + // Memory size is 100, total size of buffers is 105. + std::vector allocation_blocks; + allocation_blocks.push_back(MakeAllocationBlock(10, 20, 10)); + allocation_blocks.push_back(MakeAllocationBlock(5, 25, 15)); + allocation_blocks.push_back(MakeAllocationBlock(15, 20, 10)); + allocation_blocks.push_back(MakeAllocationBlock(12, 22, 50)); + allocation_blocks.push_back(MakeAllocationBlock(10, 18, 20)); + EXPECT_FALSE(*repacker_.Repack(absl::MakeSpan(allocation_blocks))); + + // Make sure the buffers didn't get offset assignments. + EXPECT_EQ(allocation_blocks[0]->offset, -1); + EXPECT_EQ(allocation_blocks[1]->offset, -1); + EXPECT_EQ(allocation_blocks[2]->offset, -1); + EXPECT_EQ(allocation_blocks[3]->offset, -1); + EXPECT_EQ(allocation_blocks[4]->offset, -1); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/memory_space_assignment_repacking.h b/tensorflow/compiler/xla/service/memory_space_assignment_repacking.h index fcfdfc797fb..eb2f0698a95 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment_repacking.h +++ b/tensorflow/compiler/xla/service/memory_space_assignment_repacking.h @@ -22,10 +22,10 @@ limitations under the License. namespace xla { // An interface to define allocation repacking algorithms. -template class MemorySpaceAssignmentRepacker { public: - MemorySpaceAssignmentRepacker() = default; + MemorySpaceAssignmentRepacker(int64 max_size, int64 alignment) + : max_size_(max_size), alignment_(alignment) {} virtual ~MemorySpaceAssignmentRepacker() = default; // A contiguous block of allocation consisting of start and end (logical) @@ -33,23 +33,36 @@ class MemorySpaceAssignmentRepacker { // successful and the allocations were modified, the offset field holds the // new offset. To support aliased allocations, AllocationBlock also includes a // vector of AllocationBlock pointers, called colocations. All AllocationBlock - // objects within the colocations must get the same offset. The opaque field - // is used by the MemorySpaceAssignment pass and should not be accessed by the - // repacking algorithm. + // objects within the colocations must get the same offset. The id should be + // unique and is used to ensure determinism for comparison tie-breaker. struct AllocationBlock { int64 start_time; int64 end_time; int64 size; int64 offset; int64 initial_offset; + int64 id; std::vector colocations; - O opaque; + + std::string ToString() const { + return absl::StrCat("[", start_time, ", ", end_time, "] : size = ", size, + ", offset = ", offset, + " initial offset = ", initial_offset); + } + + // This is required by BufferIntervalCompare as a tie breaker. Use a unique + // and deterministic id. + bool operator<(const AllocationBlock& other) const { return id < other.id; } }; // Repack the AllocationBlocks provided in the parameter. Returns true if // allocations have been modified and false if not. Returns a non-ok status if // there was an error. 
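+  // Illustrative example of the contract (assuming some concrete `repacker`):
+  // blocks that alias each other list one another in `colocations` and come
+  // out of a successful Repack() with identical offsets:
+  //
+  //   AllocationBlock a{/*start_time=*/0, /*end_time=*/2, /*size=*/8,
+  //                     /*offset=*/-1, /*initial_offset=*/0, /*id=*/0, {}};
+  //   AllocationBlock b{/*start_time=*/5, /*end_time=*/9, /*size=*/8,
+  //                     /*offset=*/-1, /*initial_offset=*/0, /*id=*/1, {}};
+  //   a.colocations.push_back(&b);
+  //   b.colocations.push_back(&a);
+  //   std::vector<AllocationBlock*> blocks = {&a, &b};
+  //   TF_ASSIGN_OR_RETURN(bool modified,
+  //                       repacker->Repack(absl::MakeSpan(blocks)));
+  //   // After a successful repack, a.offset == b.offset.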
virtual StatusOr Repack(absl::Span allocations) = 0; + + protected: + int64 max_size_; + int64 alignment_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/memory_space_assignment_test.cc b/tensorflow/compiler/xla/service/memory_space_assignment_test.cc index 464cfb502be..cc4f740bc25 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment_test.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment_test.cc @@ -4069,12 +4069,12 @@ TEST_P(MemorySpaceAssignmentTest, MoveCopyDoneEarlier) { // A mock MemorySpaceAssignmentRepacker class that accepst a map of // (start_time,offset) -> new_offset values. Using this map, the repacker // repacks the allocations to the new_offset. -class FakeMemorySpaceAssignmentRepacker - : public MemorySpaceAssignmentRepacker { +class FakeMemorySpaceAssignmentRepacker : public MemorySpaceAssignmentRepacker { public: - FakeMemorySpaceAssignmentRepacker( + explicit FakeMemorySpaceAssignmentRepacker( absl::flat_hash_map, int64>& repack_map) - : repack_map_(repack_map) {} + : MemorySpaceAssignmentRepacker(/*max_size=*/128, /*alignment=*/8), + repack_map_(repack_map) {} StatusOr Repack(absl::Span allocations) override { bool modified = false; @@ -4566,6 +4566,125 @@ TEST_P(MemorySpaceAssignmentTest, CrossProgramPrefetchPinnedTest) { EXPECT_EQ(cross_program_prefetches.size(), 0); } +TEST_P(MemorySpaceAssignmentTest, CrossProgramPrefetchReuse) { + // This test is for checking if the cross-program-prefetched buffer is freed + // after its last use and there is an end-of-program prefetch. + absl::string_view hlo_string = R"( + HloModule cross_program_prefetch, is_scheduled=true + + ENTRY CrossProgramPrefetch { + p0 = (f32[8,8]{1,0}, f32[8,2]{1,0}) parameter(0) + get-tuple-element = f32[8,8]{1,0} get-tuple-element(p0), index=0 + get-tuple-element.1 = f32[8,2]{1,0} get-tuple-element(p0), index=1 + dot = f32[8,2]{1,0} dot(get-tuple-element, get-tuple-element.1), lhs_contracting_dims={1}, rhs_contracting_dims={0} + negate.1 = f32[8,2]{1,0} negate(dot) + negate.2 = f32[8,2]{1,0} negate(negate.1) + negate.3 = f32[8,2]{1,0} negate(negate.2) + negate.4 = f32[8,2]{1,0} negate(negate.3) + negate.5 = f32[8,2]{1,0} negate(negate.4) + negate.6 = f32[8,2]{1,0} negate(negate.5) + negate.7 = f32[8,2]{1,0} negate(negate.6) + negate.8 = f32[8,2]{1,0} negate(negate.7) + ROOT negate.9 = f32[8,2]{1,0} negate(negate.8) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + AssignMemorySpace(module.get(), /*max_outstanding_async_copies=*/-1, + /*max_prefetch_interval=*/5, /*min_prefetch_interval=*/2); + + auto cross_program_prefetches = module->CrossProgramPrefetches(); + EXPECT_EQ(cross_program_prefetches.size(), 1); + if (!cross_program_prefetches.empty()) { + EXPECT_EQ(cross_program_prefetches[0].first, 0); + EXPECT_EQ(cross_program_prefetches[0].second, ShapeIndex({1})); + } + + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr dataflow_analysis, + HloDataflowAnalysis::Run(*module)); + const HloValue& cross_program_prefetched_value = + dataflow_analysis->GetValueDefinedAt( + module->entry_computation()->parameter_instruction(0), {1}); + // Expect that there are two prefetches that use this value, one is the + // cross-program prefetch, the other is the end-of-program prefetch. 
+ auto is_cross_program_prefetch = [](const HloUse& use) { + return use.instruction->opcode() == HloOpcode::kCopyStart && + use.instruction->is_cross_program_prefetch(); + }; + EXPECT_EQ(absl::c_count_if(cross_program_prefetched_value.uses(), + is_cross_program_prefetch), + 1); + auto is_end_of_program_prefetch = [](const HloUse& use) { + return use.instruction->opcode() == HloOpcode::kCopyStart && + !use.instruction->is_cross_program_prefetch(); + }; + EXPECT_EQ(absl::c_count_if(cross_program_prefetched_value.uses(), + is_end_of_program_prefetch), + 1); +} + +TEST_P(MemorySpaceAssignmentTest, CrossProgramPrefetchNoReuse) { + // This tests the scenario that the cross-program-prefetched buffer is used + // again close to the end of the computation. In this case, it is better not + // to free the buffer. + absl::string_view hlo_string = R"( + HloModule cross_program_prefetch, is_scheduled=true + + ENTRY CrossProgramPrefetch { + p0 = (f32[8,8]{1,0}, f32[8,2]{1,0}) parameter(0) + get-tuple-element = f32[8,8]{1,0} get-tuple-element(p0), index=0 + get-tuple-element.1 = f32[8,2]{1,0} get-tuple-element(p0), index=1 + dot = f32[8,2]{1,0} dot(get-tuple-element, get-tuple-element.1), lhs_contracting_dims={1}, rhs_contracting_dims={0} + negate.1 = f32[8,2]{1,0} negate(dot) + negate.2 = f32[8,2]{1,0} negate(negate.1) + negate.3 = f32[8,2]{1,0} negate(negate.2) + negate.4 = f32[8,2]{1,0} negate(negate.3) + negate.5 = f32[8,2]{1,0} negate(negate.4) + negate.6 = f32[8,2]{1,0} negate(negate.5) + negate.7 = f32[8,2]{1,0} negate(negate.6) + negate.8 = f32[8,2]{1,0} negate(negate.7) + ROOT dot.2 = f32[2,2]{1,0} dot(negate.8, get-tuple-element.1), lhs_contracting_dims={0}, rhs_contracting_dims={0} + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + AssignMemorySpace(module.get(), /*max_outstanding_async_copies=*/-1, + /*max_prefetch_interval=*/5, /*min_prefetch_interval=*/2); + + auto cross_program_prefetches = module->CrossProgramPrefetches(); + EXPECT_EQ(cross_program_prefetches.size(), 1); + if (!cross_program_prefetches.empty()) { + EXPECT_EQ(cross_program_prefetches[0].first, 0); + EXPECT_EQ(cross_program_prefetches[0].second, ShapeIndex({1})); + } + + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr dataflow_analysis, + HloDataflowAnalysis::Run(*module)); + const HloValue& cross_program_prefetched_value = + dataflow_analysis->GetValueDefinedAt( + module->entry_computation()->parameter_instruction(0), {1}); + // Expect that there is one prefetch that use this value, the cross-program + // prefetch. There shouldn't be an end-of-program prefetch. 
+ auto is_cross_program_prefetch = [](const HloUse& use) { + return use.instruction->opcode() == HloOpcode::kCopyStart && + use.instruction->is_cross_program_prefetch(); + }; + EXPECT_EQ(absl::c_count_if(cross_program_prefetched_value.uses(), + is_cross_program_prefetch), + 1); + auto is_end_of_program_prefetch = [](const HloUse& use) { + return use.instruction->opcode() == HloOpcode::kCopyStart && + !use.instruction->is_cross_program_prefetch(); + }; + EXPECT_EQ(absl::c_count_if(cross_program_prefetched_value.uses(), + is_end_of_program_prefetch), + 0); +} + using CostAnalysisPrefetchIntervalPickerTest = HloTestBase; TEST_F(CostAnalysisPrefetchIntervalPickerTest, PrefetchIntervalOrder) { @@ -4790,11 +4909,12 @@ TEST_F(CostAnalysisPrefetchIntervalPickerTest, NestedWhile) { HloInstruction* root = module->entry_computation()->root_instruction(); const HloUse use{root, /*operand_number=*/1, /*operand_index=*/{}}; + const Shape& shape = root->operand(1)->shape(); // We expect the root's latest prefetch start time to be before the while loop // (logical time 4). - EXPECT_EQ(interval_picker.LatestPrefetchStartTime(use, /*start_time=*/0, - /*end_time=*/23), + EXPECT_EQ(interval_picker.LatestPrefetchStartTime(shape, /*start_time=*/0, + /*end_time=*/23, &use), 4); } diff --git a/tensorflow/compiler/xla/service/memory_space_assignment_utils.cc b/tensorflow/compiler/xla/service/memory_space_assignment_utils.cc index 0215f007c9c..0c44ae0d766 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment_utils.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment_utils.cc @@ -17,21 +17,21 @@ limitations under the License. namespace xla { -bool MemorySpaceAssignmentUtils::IsIntervalAllowedInAlternateMemory( - const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval) { +bool MemorySpaceAssignmentUtils::IsValueAllowedInAlternateMemory( + const HloValue* value) { // If the buffer is a tuple, don't use this algorithm for now. The buffers // that are pointed to by the tuple will still use this algorithm. Because // tuples are cheap to place in the alternate memory (they are just pointers) // we don't need to use prefetch/evict logic. - if (interval.buffer->shape().IsTuple()) { - VLOG(4) << "Keeping value " << interval.buffer->ToShortString() + if (value->shape().IsTuple()) { + VLOG(4) << "Keeping value " << value->ToShortString() << " in default mem because it is a tuple."; return false; } // Don't place scalars in the alternate memory. - if (ShapeUtil::IsEffectiveScalar(interval.buffer->shape())) { - VLOG(4) << "Keeping value " << interval.buffer->ToShortString() + if (ShapeUtil::IsEffectiveScalar(value->shape())) { + VLOG(4) << "Keeping value " << value->ToShortString() << " in default mem because it is a scalar."; return false; } @@ -44,10 +44,10 @@ bool MemorySpaceAssignmentUtils::IsIntervalAllowedInAlternateMemory( // allocate TupleSelect in the alternate memory space. // TODO(berkin): Not allocating add-dependencies either since they need to be // treated specially. We should revisit this later. 
- for (const HloPosition& position : interval.buffer->positions()) { + for (const HloPosition& position : value->positions()) { if (position.instruction->opcode() == HloOpcode::kTupleSelect || position.instruction->opcode() == HloOpcode::kAddDependency) { - VLOG(4) << "Keeping value " << interval.buffer->ToShortString() + VLOG(4) << "Keeping value " << value->ToShortString() << " in default mem because it has a tuple-select or " << "add-dependency position."; return false; @@ -56,18 +56,18 @@ bool MemorySpaceAssignmentUtils::IsIntervalAllowedInAlternateMemory( // Send and Recv HLOs return a request identifier. These should not be // allocated in the alternate memory. - for (const HloPosition& position : interval.buffer->positions()) { + for (const HloPosition& position : value->positions()) { if ((position.instruction->opcode() == HloOpcode::kSend || position.instruction->opcode() == HloOpcode::kRecv)) { // TODO(berkin): Send/recv buffers need a stable buffer allocation // throughout sending/receiving. Disable memory space allocation for these // for now. if (position.index == ShapeIndex({0})) { - VLOG(4) << "Keeping value " << interval.buffer->ToShortString() + VLOG(4) << "Keeping value " << value->ToShortString() << " in default mem because it is a send/recv buffer."; return false; } else if (position.index == ShapeIndex({1})) { - VLOG(4) << "Keeping value " << interval.buffer->ToShortString() + VLOG(4) << "Keeping value " << value->ToShortString() << " in default mem because it is a request identifier for " "send/recv."; return false; @@ -78,11 +78,11 @@ bool MemorySpaceAssignmentUtils::IsIntervalAllowedInAlternateMemory( position.instruction->opcode() == HloOpcode::kCollectivePermuteDone)) { // Disable memory space allocation for these for now. if (position.index == ShapeIndex({0})) { - VLOG(4) << "Keeping value " << interval.buffer->ToShortString() + VLOG(4) << "Keeping value " << value->ToShortString() << " in default mem because it is a collective-permute buffer."; return false; } else if (position.index == ShapeIndex({1})) { - VLOG(4) << "Keeping value " << interval.buffer->ToShortString() + VLOG(4) << "Keeping value " << value->ToShortString() << " in default mem because it is a collective-permute buffer."; return false; } @@ -92,4 +92,10 @@ bool MemorySpaceAssignmentUtils::IsIntervalAllowedInAlternateMemory( return true; } +bool MemorySpaceAssignmentUtils::IsIntervalAllowedInAlternateMemory( + const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval) { + return IsValueAllowedInAlternateMemory(interval.buffer) && + absl::c_all_of(interval.colocations, IsValueAllowedInAlternateMemory); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/memory_space_assignment_utils.h b/tensorflow/compiler/xla/service/memory_space_assignment_utils.h index 651ac107c25..082efa5eb64 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment_utils.h +++ b/tensorflow/compiler/xla/service/memory_space_assignment_utils.h @@ -26,7 +26,11 @@ class MemorySpaceAssignmentUtils { // Returns true if this buffer is allowed to be placed in the alternate // memory. static bool IsIntervalAllowedInAlternateMemory( - const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval); + const GlobalDecreasingSizeBestFitHeap::BufferInterval& + interval); + + // Returns true if the HloValue is allowed to be placed in alternate memory. 
+ static bool IsValueAllowedInAlternateMemory(const HloValue* value); }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/mlir_gpu/BUILD b/tensorflow/compiler/xla/service/mlir_gpu/BUILD index 31cf36dee85..68bcde4f7ee 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/BUILD +++ b/tensorflow/compiler/xla/service/mlir_gpu/BUILD @@ -149,6 +149,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/stream_executor:stream_executor_headers", "@com_google_absl//absl/container:flat_hash_map", + "@llvm-project//llvm:Core", "@llvm-project//mlir:IR", "@llvm-project//mlir:LLVMDialect", "@llvm-project//mlir:StandardOps", diff --git a/tensorflow/compiler/xla/service/mlir_gpu/emission_context.cc b/tensorflow/compiler/xla/service/mlir_gpu/emission_context.cc index ca979262df0..cb5ea946c1b 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/emission_context.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/emission_context.cc @@ -25,6 +25,7 @@ namespace mlir_gpu { EmissionContext::EmissionContext(std::unique_ptr module) : module_(std::move(module)), context_() { + context_.loadAllGloballyRegisteredDialects(); error_handler_ = [](const ErrorMap& instructions_with_error, HloModule* module) { std::set computations_with_error; diff --git a/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_test.cc b/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_test.cc index d5cad385324..f7a7decff76 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_test.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_test.cc @@ -46,6 +46,7 @@ std::string CompileHloConvAndGetMlir(absl::string_view hlo_text) { hlo_module.entry_computation()->root_instruction(); mlir::MLIRContext context; + context.loadAllGloballyRegisteredDialects(); mlir::OwningModuleRef mlir_module( mlir::ModuleOp::create(mlir::UnknownLoc::get(&context))); diff --git a/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc b/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc index e0d7456fbb8..b275dd4525f 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc @@ -17,6 +17,7 @@ limitations under the License. #include +#include "llvm/IR/DataLayout.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project @@ -203,9 +204,13 @@ LhloDialectEmitter::LhloDialectEmitter( builder_(mlir_module_.getContext()), buffer_assignment_(assignment), platform_(platform) { - LLVMDialect* llvmDialect = - mlir_module.getContext()->getRegisteredDialect(); - pointer_size_ = llvmDialect->getDataLayout().getPointerSize(); + llvm::DataLayout data_layout(""); + if (auto data_layout_attr = mlir_module.getAttrOfType( + mlir::LLVM::LLVMDialect::getDataLayoutAttrName())) { + data_layout.reset(data_layout_attr.getValue()); + } + + pointer_size_ = data_layout.getPointerSize(); } void LhloDialectEmitter::AddThunkToThunkSequence(std::unique_ptr thunk) { diff --git a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.cc b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.cc index df2bd2e4c23..26c9e155c0c 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.cc @@ -25,19 +25,8 @@ limitations under the License. 
namespace xla { namespace mlir_gpu { -namespace { -using ::mlir::MLIRContext; -using ::mlir::LLVM::LLVMDialect; - -int64 GetPointerSize(MLIRContext* context) { - LLVMDialect* dialect = context->getRegisteredDialect(); - return dialect->getDataLayout().getPointerSize(); -} - -} // namespace - -MlirCompiler::MlirCompiler() : pointer_size_(GetPointerSize(&context_)) {} +MlirCompiler::MlirCompiler() : data_layout_("") {} se::Platform::Id MlirCompiler::PlatformId() const { return stream_executor::cuda::kCudaPlatformId; diff --git a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.h b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.h index a7b2f9446fa..261e249c0a1 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.h +++ b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_MLIR_COMPILER_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_MLIR_COMPILER_H_ +#include "llvm/IR/DataLayout.h" #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project #include "tensorflow/compiler/xla/service/compiler.h" @@ -58,7 +59,7 @@ class MlirCompiler : public Compiler { protected: ::mlir::MLIRContext context_; - int64 pointer_size_; + llvm::DataLayout data_layout_; IRHook module_hook_; ErrorHandler error_handler_; }; diff --git a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc index 4879c6b5099..c7977aa776a 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc @@ -104,7 +104,7 @@ class MlirCompilerImpl : public MlirCompiler { const AotCompilationOptions& options) override; HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override { - int64 pointer_size = pointer_size_; + int64 pointer_size = data_layout_.getPointerSize(); return [pointer_size](const Shape& shape) { return ShapeUtil::ByteSizeOf(shape, pointer_size); }; @@ -462,9 +462,9 @@ StatusOr> MlirCompilerImpl::RunBackend( // must also be used to determine the thunk launch schedule. std::unique_ptr stream_assignment = xla::gpu::AssignStreams(*module); - TF_ASSIGN_OR_RETURN( - std::unique_ptr hlo_schedule, - GpuHloSchedule::Build(*module, *stream_assignment, pointer_size_)); + TF_ASSIGN_OR_RETURN(std::unique_ptr hlo_schedule, + GpuHloSchedule::Build(*module, *stream_assignment, + data_layout_.getPointerSize())); // Run buffer analysis on the HLO graph. This analysis figures out which // temporary buffers are required to run the computation. diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h index febbf9294b0..eb29fa89098 100644 --- a/tensorflow/compiler/xla/service/pattern_matcher.h +++ b/tensorflow/compiler/xla/service/pattern_matcher.h @@ -351,8 +351,7 @@ class AllOfPattern { // Returns a pattern that represents the conjunction of all input patterns. All // patterns need to match in order to have the AllOf pattern match. template -detail::AllOfPattern::type, Patterns...> AllOf( - const Patterns&... patterns) { +auto AllOf(const Patterns&... patterns) { return detail::AllOfPattern::type, Patterns...>(patterns...); } @@ -361,10 +360,8 @@ detail::AllOfPattern::type, Patterns...> AllOf( // // This transformation is necessary for good pretty-printing. 
template -detail::AllOfPattern::type, InnerPs..., - OuterPs...> -AllOf(const detail::AllOfPattern& inner_p, - const OuterPs&... outer_ps) { +auto AllOf(const detail::AllOfPattern& inner_p, + const OuterPs&... outer_ps) { // Invoke constructor of AllOfPattern. auto make_all_of = [](const InnerPs&... inner_ps, const OuterPs&... outer_ps) { @@ -453,10 +450,7 @@ template class LayoutPattern { private: template - auto AppendImpl(NewImpl new_impl) const - -> LayoutPattern(std::declval(), - std::move(new_impl)))> { + auto AppendImpl(NewImpl new_impl) const { auto new_allof = AllOf<::xla::Layout>(impl_, std::move(new_impl)); return LayoutPattern(std::move(new_allof), matched_layout_); @@ -495,14 +489,12 @@ class LayoutPattern { // Modifies the pattern to match only if the layout equals the given proto. // The layout must outlive the returned pattern. - constexpr auto EqualTo(const ::xla::Layout* layout) const - -> decltype(this->AppendImpl(LayoutPatternEqualImpl(layout))) { + constexpr auto EqualTo(const ::xla::Layout* layout) const { return AppendImpl(LayoutPatternEqualImpl(layout)); } // Modifies the pattern to match only if the layout has a dense format. - constexpr auto WithDenseFormat() const - -> decltype(this->AppendImpl(LayoutPatternFormatImpl(DENSE))) { + constexpr auto WithDenseFormat() const { return AppendImpl(LayoutPatternFormatImpl(DENSE)); } @@ -626,17 +618,14 @@ class AnyOfPattern { // patterns. The returned pattern matches from left to right, and stops on the // first match. template -detail::AnyOfPattern::type, Patterns...> AnyOf( - const Patterns&... patterns) { +auto AnyOf(const Patterns&... patterns) { return detail::AnyOfPattern::type, Patterns...>(patterns...); } // Creates a layout pattern that will capture the matched layout in the // argument. -inline constexpr detail::LayoutPattern -Layout(const ::xla::Layout** matched_layout = nullptr) { +inline constexpr auto Layout(const ::xla::Layout** matched_layout = nullptr) { return detail::LayoutPattern( detail::LayoutPatternBaseImpl(), matched_layout); @@ -644,9 +633,7 @@ Layout(const ::xla::Layout** matched_layout = nullptr) { // Creates a layout pattern that will capture the matched layout in the // argument. -inline constexpr detail::LayoutPattern<::xla::Layout, - detail::LayoutPatternBaseImpl> -Layout(::xla::Layout** matched_layout) { +inline constexpr auto Layout(::xla::Layout** matched_layout) { return detail::LayoutPattern<::xla::Layout, detail::LayoutPatternBaseImpl>( detail::LayoutPatternBaseImpl(), matched_layout); } @@ -939,10 +926,7 @@ template class ShapePattern { private: template - auto AppendImpl(NewImpl new_impl) const - -> ShapePattern(std::declval(), - std::move(new_impl)))> { + auto AppendImpl(NewImpl new_impl) const { auto new_all_of = AllOf<::xla::Shape>(impl_, std::move(new_impl)); return ShapePattern(std::move(new_all_of), matched_shape_); @@ -988,80 +972,66 @@ class ShapePattern { // Modifies the pattern to match only if the shape equals the given proto. // The layout must outlive the returned pattern. - constexpr auto EqualTo(const ::xla::Shape* shape) const - -> decltype(this->AppendImpl(ShapePatternEqualImpl(shape))) { + constexpr auto EqualTo(const ::xla::Shape* shape) const { return AppendImpl(ShapePatternEqualImpl(shape)); } // Modifies the pattern to match only if the shape is compatible to the given // proto. The layout must outlive the returned pattern. 
- constexpr auto CompatibleTo(const ::xla::Shape* shape) const - -> decltype(this->AppendImpl(ShapePatternCompatibleImpl(shape))) { + constexpr auto CompatibleTo(const ::xla::Shape* shape) const { return AppendImpl(ShapePatternCompatibleImpl(shape)); } // Modifies the pattern to match only if the shape has the given element type. - constexpr auto WithElementType(PrimitiveType element_type) const - -> decltype(this->AppendImpl(ShapePatternElementTypeImpl(element_type))) { + constexpr auto WithElementType(PrimitiveType element_type) const { return AppendImpl(ShapePatternElementTypeImpl(element_type)); } // Modifies the pattern to match only if the shape is scalar. - constexpr auto IsScalar() const - -> decltype(this->AppendImpl(ShapePatternIsScalarImpl())) { + constexpr auto IsScalar() const { return AppendImpl(ShapePatternIsScalarImpl()); } // Modifies the pattern to match only if the shape is an array. - constexpr auto IsArray() const - -> decltype(this->AppendImpl(ShapePatternIsArrayImpl())) { + constexpr auto IsArray() const { return AppendImpl(ShapePatternIsArrayImpl()); } // Modifies the pattern to match only if the shape is a tuple. - constexpr auto IsTuple() const - -> decltype(this->AppendImpl(ShapePatternIsTupleImpl())) { + constexpr auto IsTuple() const { return AppendImpl(ShapePatternIsTupleImpl()); } - constexpr auto IsEffectiveScalar() const - -> decltype(this->AppendImpl(ShapePatternEffectiveScalarImpl())) { + constexpr auto IsEffectiveScalar() const { return AppendImpl(ShapePatternEffectiveScalarImpl()); } // Modifies the pattern to match only if the shape has the given rank. - constexpr auto WithRank(int64 rank) const - -> decltype(this->AppendImpl(ShapePatternRankImpl(rank))) { + constexpr auto WithRank(int64 rank) const { return AppendImpl(ShapePatternRankImpl(rank)); } // Modifies the pattern to match only if the shape has a layout that matches // the given pattern. template - auto WithLayout(const LayoutPattern& layout) const - -> decltype(this->AppendImpl( - ShapePatternLayoutImpl(layout))) { + auto WithLayout(const LayoutPattern& layout) const { return AppendImpl(ShapePatternLayoutImpl(layout)); } - constexpr auto WithLayoutEqualTo(const ::xla::Layout* layout) const - -> decltype(this->WithLayout(Layout().EqualTo(layout))) { + constexpr auto WithLayoutEqualTo(const ::xla::Layout* layout) const { return WithLayout(Layout().EqualTo(layout)); } - constexpr auto IsDenseArray() const - -> decltype(this->WithLayout(Layout().WithDenseFormat())) { + constexpr auto IsDenseArray() const { return WithLayout(Layout().WithDenseFormat()); } // Modifies the pattern to match only if the shape has a subshape that matches // the given pattern. template - auto WithSubshape(ShapeIndexView index, - const ShapePattern& subshape) - const -> decltype(this->AppendImpl( - ShapePatternSubshapeImpl(index, - subshape))) { + auto WithSubshape( + ShapeIndexView index, + const ShapePattern& subshape) const { return AppendImpl( ShapePatternSubshapeImpl(index, subshape)); } @@ -1101,17 +1071,13 @@ class ShapePattern { } // namespace detail // Creates a shape pattern that will capture the matched layout in the argument. -inline constexpr detail::ShapePattern -Shape(const ::xla::Shape** matched_shape = nullptr) { +inline constexpr auto Shape(const ::xla::Shape** matched_shape = nullptr) { return detail::ShapePattern( detail::ShapePatternBaseImpl(), matched_shape); } // Creates a shape pattern that will capture the matched layout in the argument. 
-inline constexpr detail::ShapePattern<::xla::Shape, - detail::ShapePatternBaseImpl> -Shape(::xla::Shape** matched_shape) { +inline constexpr auto Shape(::xla::Shape** matched_shape) { return detail::ShapePattern<::xla::Shape, detail::ShapePatternBaseImpl>( detail::ShapePatternBaseImpl(), matched_shape); } @@ -1797,9 +1763,7 @@ template class HloInstructionPattern { private: template - auto AppendImpl(NewImpl new_impl) const -> HloInstructionPattern< - HloInstructionType, decltype(AllOf<::xla::HloInstruction>( - std::declval(), std::move(new_impl)))> { + auto AppendImpl(NewImpl new_impl) const { auto new_allof = AllOf<::xla::HloInstruction>(impl_, std::move(new_impl)); return HloInstructionPattern( std::move(new_allof), matched_inst_); @@ -1837,51 +1801,38 @@ class HloInstructionPattern { } // Modifies the pattern to match only if the instruction has the given name. - auto WithName(absl::string_view name) const - -> decltype(this->AppendImpl(HloInstructionPatternNameImpl(name))) { + auto WithName(absl::string_view name) const { return AppendImpl(HloInstructionPatternNameImpl(name)); } // Modifies the pattern to match only if the instruction has the given opcode. - auto WithOpcode(HloOpcode opcode) const - -> decltype(this->AppendImpl(HloInstructionPatternOpcodeImpl(opcode, - false))) { + auto WithOpcode(HloOpcode opcode) const { return AppendImpl(HloInstructionPatternOpcodeImpl(opcode, false)); } // Modifies the pattern to match only the custom call with a given target. - auto WithCustomCallTarget(absl::string_view custom_call_target) const - -> decltype(this->AppendImpl( - HloInstructionCustomCallTargetImpl(custom_call_target))) { + auto WithCustomCallTarget(absl::string_view custom_call_target) const { return AppendImpl(HloInstructionCustomCallTargetImpl(custom_call_target)); } - auto WithNumOperands(int64 num_operands) const -> decltype( - this->AppendImpl(HloInstructionPatternNumOperandsImpl(num_operands))) { + auto WithNumOperands(int64 num_operands) const { return AppendImpl(HloInstructionPatternNumOperandsImpl(num_operands)); } // Modifies the pattern to match only if the instruction does not have the // given opcode. - auto WithoutOpcode(HloOpcode opcode) const - -> decltype(this->AppendImpl(HloInstructionPatternOpcodeImpl(opcode, - true))) { + auto WithoutOpcode(HloOpcode opcode) const { return AppendImpl(HloInstructionPatternOpcodeImpl(opcode, true)); } - constexpr auto Is(const HloInstruction* instr) const - -> decltype(this->AppendImpl(HloInstructionIsImpl(instr))) { + constexpr auto Is(const HloInstruction* instr) const { return AppendImpl(HloInstructionIsImpl(instr)); } // Modifies the pattern to match only if the instruction is a constant. - constexpr auto IsConstant() const - -> decltype(this->WithOpcode(HloOpcode::kConstant)) { - return WithOpcode(HloOpcode::kConstant); - } + constexpr auto IsConstant() const { return WithOpcode(HloOpcode::kConstant); } - constexpr auto IsConstantScalar() const -> decltype(this->AppendImpl( - HloConstantScalarImpl(/*match_effective_scalar=*/false))) { + constexpr auto IsConstantScalar() const { return AppendImpl( HloConstantScalarImpl(/*match_effective_scalar=*/false)); } @@ -1889,39 +1840,32 @@ class HloInstructionPattern { // This does not check that T has the same type as the instruction, so e.g. // IsConstantScalar(1.0) may match a constant of shape int32[]. 
template - constexpr auto IsConstantScalar(const ScalarTy& val) const - -> decltype(this->AppendImpl(HloConstantScalarImpl( - val, /*match_effective_scalar=*/false))) { + constexpr auto IsConstantScalar(const ScalarTy& val) const { return AppendImpl( HloConstantScalarImpl(val, /*match_effective_scalar=*/false)); } - constexpr auto IsConstantEffectiveScalar() const -> decltype(this->AppendImpl( - HloConstantScalarImpl(/*match_effective_scalar=*/true))) { + constexpr auto IsConstantEffectiveScalar() const { return AppendImpl( HloConstantScalarImpl(/*match_effective_scalar=*/true)); } template - constexpr auto IsConstantEffectiveScalar(const ScalarTy& val) const - -> decltype(this->AppendImpl(HloConstantScalarImpl( - val, /*match_effective_scalar=*/true))) { + constexpr auto IsConstantEffectiveScalar(const ScalarTy& val) const { return AppendImpl( HloConstantScalarImpl(val, /*match_effective_scalar=*/true)); } // Modifies the pattern to match only if the instruction is not a constant. - constexpr auto IsNonConstant() const - -> decltype(this->WithoutOpcode(HloOpcode::kConstant)) { + constexpr auto IsNonConstant() const { return WithoutOpcode(HloOpcode::kConstant); } // Modifies the pattern to match only if the instruction has a shape that // matches the given pattern. template - constexpr auto WithShape(const ShapePattern& shape) - const -> decltype(this->AppendImpl( - HloInstructionPatternShapeImpl(shape))) { + constexpr auto WithShape( + const ShapePattern& shape) const { return AppendImpl( HloInstructionPatternShapeImpl(shape)); } @@ -1929,16 +1873,14 @@ class HloInstructionPattern { // Make this a templated function to work around gcc 4.9.4 template infinite // recursion bug. template - constexpr auto WithShapeEqualTo(const ::xla::Shape* shape) const - -> decltype(this->WithShape(Shape().EqualTo(shape))) { + constexpr auto WithShapeEqualTo(const ::xla::Shape* shape) const { return WithShape(Shape().EqualTo(shape)); } // Make this a templated function to work around gcc 4.9.4 template infinite // recursion bug. template - constexpr auto WithShapeCompatibleTo(const ::xla::Shape* shape) const - -> decltype(this->WithShape(Shape().CompatibleTo(shape))) { + constexpr auto WithShapeCompatibleTo(const ::xla::Shape* shape) const { return WithShape(Shape().CompatibleTo(shape)); } @@ -1947,10 +1889,7 @@ class HloInstructionPattern { template constexpr auto WithOperand( int64 operand_index, - const HloInstructionPattern& operand) const - -> decltype(this->AppendImpl( - HloInstructionPatternOperandImpl( - operand_index, operand))) { + const HloInstructionPattern& operand) const { return AppendImpl( HloInstructionPatternOperandImpl( operand_index, operand)); @@ -1960,11 +1899,7 @@ class HloInstructionPattern { typename OperandImpl2> constexpr auto WithBinaryOperandsAnyOrder( const HloInstructionPattern& op1, - const HloInstructionPattern& op2) const - -> decltype(this->AppendImpl( - HloInstructionPatternBinaryOperandsAnyOrderImpl< - OperandType1, OperandImpl1, OperandType2, OperandImpl2>(op1, - op2))) { + const HloInstructionPattern& op2) const { return AppendImpl( HloInstructionPatternBinaryOperandsAnyOrderImpl< OperandType1, OperandImpl1, OperandType2, OperandImpl2>(op1, op2)); @@ -1972,46 +1907,39 @@ class HloInstructionPattern { // Modifies the pattern to match only if the instruction is a fusion node with // the given kind. 
- constexpr auto WithFusionKind(HloInstruction::FusionKind kind) const - -> decltype(this->AppendImpl(HloInstructionPatternFusionKindImpl(kind))) { + constexpr auto WithFusionKind(HloInstruction::FusionKind kind) const { return AppendImpl(HloInstructionPatternFusionKindImpl(kind)); } // Modifies the pattern to match only if the instruction is a // get-tuple-element with the given tuple index. - constexpr auto WithTupleIndex(int64 tuple_index) const -> decltype( - this->AppendImpl(HloInstructionPatternTupleIndexImpl(tuple_index))) { + constexpr auto WithTupleIndex(int64 tuple_index) const { return AppendImpl(HloInstructionPatternTupleIndexImpl(tuple_index)); } // Modifies the pattern to match only if the instruction is a parameter // with the given parameter number. - constexpr auto WithParameterNum(int64 parameter_num) const -> decltype( - this->AppendImpl(HloInstructionPatternParameterNumImpl(parameter_num))) { + constexpr auto WithParameterNum(int64 parameter_num) const { return AppendImpl(HloInstructionPatternParameterNumImpl(parameter_num)); } // Modifies the pattern to match if the instruction is used exactly once. // Does not match if the instruction is used twice by the same user (e.g. // multiply(x,x)). - constexpr auto WithOneUse() const - -> decltype(this->AppendImpl(HloInstructionPatternOneUseImpl())) { + constexpr auto WithOneUse() const { return AppendImpl(HloInstructionPatternOneUseImpl()); } // Modifies the pattern to match if the instruction is used by exactly one // other instruction. Will match if the instruction is used twice, so long as // it's by the same user (e.g. multiply(x,x)). - constexpr auto WithOneUser() const - -> decltype(this->AppendImpl(HloInstructionPatternOneUserImpl())) { + constexpr auto WithOneUser() const { return AppendImpl(HloInstructionPatternOneUserImpl()); } // Modifies the pattern to match only if the instruction has the given // comparison direction. - auto WithComparisonDirection(ComparisonDirection direction) const - -> decltype(this->AppendImpl( - HloInstructionPatternComparisonDirectionImpl(direction))) { + auto WithComparisonDirection(ComparisonDirection direction) const { return AppendImpl(HloInstructionPatternComparisonDirectionImpl(direction)); } @@ -2028,9 +1956,7 @@ class HloInstructionPattern { // Creates an instruction pattern that will capture the matched instruction in // the argument. -inline constexpr detail::HloInstructionPattern< - const ::xla::HloInstruction, detail::HloInstructionPatternBaseImpl> -Op(const ::xla::HloInstruction** matched_inst = nullptr) { +inline constexpr auto Op(const ::xla::HloInstruction** matched_inst = nullptr) { return detail::HloInstructionPattern( detail::HloInstructionPatternBaseImpl(), matched_inst); @@ -2038,24 +1964,19 @@ Op(const ::xla::HloInstruction** matched_inst = nullptr) { // Creates an instruction pattern that will capture the matched instruction in // the argument. -inline constexpr detail::HloInstructionPattern< - ::xla::HloInstruction, detail::HloInstructionPatternBaseImpl> -Op(::xla::HloInstruction** matched_inst) { +inline constexpr auto Op(::xla::HloInstruction** matched_inst) { return detail::HloInstructionPattern<::xla::HloInstruction, detail::HloInstructionPatternBaseImpl>( detail::HloInstructionPatternBaseImpl(), matched_inst); } // Helpers for nullary instructions. 
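The switch to deduced (auto) return types above does not change how these factories are used; call sites still compose Op() with the fluent modifiers. A minimal usage sketch, illustrative only: the function name and its arguments are hypothetical, while xla::Match, xla::match::Op, and the modifiers shown are the existing pattern-matcher API touched by this patch.

  #include "tensorflow/compiler/xla/service/pattern_matcher.h"

  // Returns true if instr is add(x, constant-scalar) in either operand order,
  // capturing the non-constant operand in *x.
  bool IsAddOfConstantScalar(xla::HloInstruction* instr,
                             xla::HloInstruction** x) {
    namespace m = xla::match;
    return Match(instr, m::AddAnyOrder(m::Op(x).IsNonConstant(),
                                       m::ConstantScalar()));
  }
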
-#define XLA_NULLOP_PATTERN(NAME) \ - inline auto NAME()->decltype(Op().WithOpcode(HloOpcode::k##NAME)) { \ - return Op().WithOpcode(HloOpcode::k##NAME); \ - } \ - \ - template \ - inline auto NAME(HloInstructionType** matched_inst) \ - ->decltype(Op(matched_inst).WithOpcode(HloOpcode::k##NAME)) { \ - return Op(matched_inst).WithOpcode(HloOpcode::k##NAME); \ +#define XLA_NULLOP_PATTERN(NAME) \ + inline auto NAME() { return Op().WithOpcode(HloOpcode::k##NAME); } \ + \ + template \ + inline auto NAME(HloInstructionType** matched_inst) { \ + return Op(matched_inst).WithOpcode(HloOpcode::k##NAME); \ } XLA_NULLOP_PATTERN(Constant) XLA_NULLOP_PATTERN(Parameter) @@ -2064,28 +1985,21 @@ XLA_NULLOP_PATTERN(Rng) #undef XLA_NULLOP_PATTERN // Helpers for unary instructions. -#define XLA_UNOP_PATTERN(NAME) \ - inline auto NAME()->decltype(Op().WithOpcode(HloOpcode::k##NAME)) { \ - return Op().WithOpcode(HloOpcode::k##NAME); \ - } \ - \ - template \ - inline auto NAME(Arg&& arg)->decltype( \ - Op().WithOpcode(HloOpcode::k##NAME) \ - .WithOperand(0, std::forward(arg))) { \ - return Op() \ - .WithOpcode(HloOpcode::k##NAME) \ - .WithOperand(0, std::forward(arg)); \ - } \ - \ - template \ - inline auto NAME(HloInstructionType** matched_inst, Arg&& arg) \ - ->decltype(Op(matched_inst) \ - .WithOpcode(HloOpcode::k##NAME) \ - .WithOperand(0, std::forward(arg))) { \ - return Op(matched_inst) \ - .WithOpcode(HloOpcode::k##NAME) \ - .WithOperand(0, std::forward(arg)); \ +#define XLA_UNOP_PATTERN(NAME) \ + inline auto NAME() { return Op().WithOpcode(HloOpcode::k##NAME); } \ + \ + template \ + inline auto NAME(Arg&& arg) { \ + return Op() \ + .WithOpcode(HloOpcode::k##NAME) \ + .WithOperand(0, std::forward(arg)); \ + } \ + \ + template \ + inline auto NAME(HloInstructionType** matched_inst, Arg&& arg) { \ + return Op(matched_inst) \ + .WithOpcode(HloOpcode::k##NAME) \ + .WithOperand(0, std::forward(arg)); \ } XLA_UNOP_PATTERN(Abs) XLA_UNOP_PATTERN(RoundNearestAfz) @@ -2124,55 +2038,40 @@ XLA_UNOP_PATTERN(Transpose) #undef XLA_UNOP_PATTERN // Helpers for binary instructions. 
-#define XLA_BINOP_PATTERN(NAME) \ - inline auto NAME()->decltype(Op().WithOpcode(HloOpcode::k##NAME)) { \ - return Op().WithOpcode(HloOpcode::k##NAME); \ - } \ - \ - template \ - inline auto NAME(Lhs&& lhs, Rhs&& rhs) \ - ->decltype(Op().WithOpcode(HloOpcode::k##NAME) \ - .WithOperand(0, std::forward(lhs)) \ - .WithOperand(1, std::forward(rhs))) { \ - return Op() \ - .WithOpcode(HloOpcode::k##NAME) \ - .WithOperand(0, std::forward(lhs)) \ - .WithOperand(1, std::forward(rhs)); \ - } \ - \ - template \ - inline auto NAME(HloInstructionType** matched_inst, Lhs&& lhs, Rhs&& rhs) \ - ->decltype(Op(matched_inst) \ - .WithOpcode(HloOpcode::k##NAME) \ - .WithOperand(0, std::forward(lhs)) \ - .WithOperand(1, std::forward(rhs))) { \ - return Op(matched_inst) \ - .WithOpcode(HloOpcode::k##NAME) \ - .WithOperand(0, std::forward(lhs)) \ - .WithOperand(1, std::forward(rhs)); \ +#define XLA_BINOP_PATTERN(NAME) \ + inline auto NAME() { return Op().WithOpcode(HloOpcode::k##NAME); } \ + \ + template \ + inline auto NAME(Lhs&& lhs, Rhs&& rhs) { \ + return Op() \ + .WithOpcode(HloOpcode::k##NAME) \ + .WithOperand(0, std::forward(lhs)) \ + .WithOperand(1, std::forward(rhs)); \ + } \ + \ + template \ + inline auto NAME(HloInstructionType** matched_inst, Lhs&& lhs, Rhs&& rhs) { \ + return Op(matched_inst) \ + .WithOpcode(HloOpcode::k##NAME) \ + .WithOperand(0, std::forward(lhs)) \ + .WithOperand(1, std::forward(rhs)); \ } -#define XLA_COMMUTATIVE_BINOP_PATTERN(NAME) \ - XLA_BINOP_PATTERN(NAME) \ - \ - template \ - inline auto NAME##AnyOrder(HloInstructionType** matched_inst, Lhs&& lhs, \ - Rhs&& rhs) \ - ->decltype(Op(matched_inst) \ - .WithOpcode(HloOpcode::k##NAME) \ - .WithBinaryOperandsAnyOrder(std::forward(lhs), \ - std::forward(rhs))) { \ - return Op(matched_inst) \ - .WithOpcode(HloOpcode::k##NAME) \ - .WithBinaryOperandsAnyOrder(std::forward(lhs), \ - std::forward(rhs)); \ - } \ - template \ - inline auto NAME##AnyOrder(Lhs&& lhs, Rhs&& rhs) \ - ->decltype(NAME##AnyOrder( \ - nullptr, std::forward(lhs), std::forward(rhs))) { \ - return NAME##AnyOrder( \ - nullptr, std::forward(lhs), std::forward(rhs)); \ +#define XLA_COMMUTATIVE_BINOP_PATTERN(NAME) \ + XLA_BINOP_PATTERN(NAME) \ + \ + template \ + inline auto NAME##AnyOrder(HloInstructionType** matched_inst, Lhs&& lhs, \ + Rhs&& rhs) { \ + return Op(matched_inst) \ + .WithOpcode(HloOpcode::k##NAME) \ + .WithBinaryOperandsAnyOrder(std::forward(lhs), \ + std::forward(rhs)); \ + } \ + template \ + inline auto NAME##AnyOrder(Lhs&& lhs, Rhs&& rhs) { \ + return NAME##AnyOrder( \ + nullptr, std::forward(lhs), std::forward(rhs)); \ } XLA_COMMUTATIVE_BINOP_PATTERN(Add) XLA_BINOP_PATTERN(Atan2) @@ -2202,16 +2101,10 @@ XLA_BINOP_PATTERN(ShiftRightLogical) // Helpers for ternary instructions. 
#define XLA_TERNOP_PATTERN(NAME) \ - inline auto NAME()->decltype(Op().WithOpcode(HloOpcode::k##NAME)) { \ - return Op().WithOpcode(HloOpcode::k##NAME); \ - } \ + inline auto NAME() { return Op().WithOpcode(HloOpcode::k##NAME); } \ \ template \ - inline auto NAME(Arg0&& arg0, Arg1&& arg1, Arg2&& arg2) \ - ->decltype(Op().WithOpcode(HloOpcode::k##NAME) \ - .WithOperand(0, std::forward(arg0)) \ - .WithOperand(1, std::forward(arg1)) \ - .WithOperand(2, std::forward(arg2))) { \ + inline auto NAME(Arg0&& arg0, Arg1&& arg1, Arg2&& arg2) { \ return Op() \ .WithOpcode(HloOpcode::k##NAME) \ .WithOperand(0, std::forward(arg0)) \ @@ -2222,12 +2115,7 @@ XLA_BINOP_PATTERN(ShiftRightLogical) template \ inline auto NAME(HloInstructionType** matched_inst, Arg0&& arg0, \ - Arg1&& arg1, Arg2&& arg2) \ - ->decltype(Op(matched_inst) \ - .WithOpcode(HloOpcode::k##NAME) \ - .WithOperand(0, std::forward(arg0)) \ - .WithOperand(1, std::forward(arg1)) \ - .WithOperand(2, std::forward(arg2))) { \ + Arg1&& arg1, Arg2&& arg2) { \ return Op(matched_inst) \ .WithOpcode(HloOpcode::k##NAME) \ .WithOperand(0, std::forward(arg0)) \ @@ -2241,17 +2129,13 @@ XLA_TERNOP_PATTERN(Select); namespace detail { template -inline auto WithOperands(Matcher&& m, int64 operand_num, FirstArg&& first_arg) - -> decltype(m.WithOperand(operand_num, std::forward(first_arg))) { +inline auto WithOperands(Matcher&& m, int64 operand_num, FirstArg&& first_arg) { return m.WithOperand(operand_num, std::forward(first_arg)); } template inline auto WithOperands(Matcher&& m, int64 operand_num, FirstArg&& first_arg, - Args&&... args) - -> decltype(WithOperands(m.WithOperand(operand_num, - std::forward(first_arg)), - operand_num + 1, std::forward(args)...)) { + Args&&... args) { return WithOperands( m.WithOperand(operand_num, std::forward(first_arg)), operand_num + 1, std::forward(args)...); @@ -2259,26 +2143,17 @@ inline auto WithOperands(Matcher&& m, int64 operand_num, FirstArg&& first_arg, } // namespace detail #define XLA_VARIADIC_OP_PATTERN(NAME) \ - inline auto NAME()->decltype(Op().WithOpcode(HloOpcode::k##NAME)) { \ - return Op().WithOpcode(HloOpcode::k##NAME); \ - } \ + inline auto NAME() { return Op().WithOpcode(HloOpcode::k##NAME); } \ \ template \ - inline auto NAME(Args&&... args) \ - ->decltype(detail::WithOperands(Op().WithOpcode(HloOpcode::k##NAME) \ - .WithNumOperands(sizeof...(Args)), \ - 0, std::forward(args)...)) { \ + inline auto NAME(Args&&... args) { \ return detail::WithOperands( \ Op().WithOpcode(HloOpcode::k##NAME).WithNumOperands(sizeof...(Args)), \ /*operand_num=*/0, std::forward(args)...); \ } \ \ template \ - inline auto NAME(HloInstructionType** matched_inst, Args&&... args) \ - ->decltype(detail::WithOperands(Op(matched_inst) \ - .WithOpcode(HloOpcode::k##NAME) \ - .WithNumOperands(sizeof...(Args)), \ - 0, std::forward(args)...)) { \ + inline auto NAME(HloInstructionType** matched_inst, Args&&... args) { \ return detail::WithOperands(Op(matched_inst) \ .WithOpcode(HloOpcode::k##NAME) \ .WithNumOperands(sizeof...(Args)), \ @@ -2299,63 +2174,46 @@ XLA_VARIADIC_OP_PATTERN(Sort); XLA_VARIADIC_OP_PATTERN(Tuple); // Helpers for comparison instructions. 
-#define XLA_COMPARE_PATTERN(NAME) \ - inline auto NAME()->decltype( \ - Op().WithOpcode(HloOpcode::kCompare) \ - .WithComparisonDirection(ComparisonDirection::k##NAME)) { \ - return Op() \ - .WithOpcode(HloOpcode::kCompare) \ - .WithComparisonDirection(ComparisonDirection::k##NAME); \ - } \ - \ - template \ - inline auto NAME(Lhs&& lhs, Rhs&& rhs) \ - ->decltype(Op().WithOpcode(HloOpcode::kCompare) \ - .WithOperand(0, std::forward(lhs)) \ - .WithOperand(1, std::forward(rhs)) \ - .WithComparisonDirection(ComparisonDirection::k##NAME)) { \ - return Op() \ - .WithOpcode(HloOpcode::kCompare) \ - .WithOperand(0, std::forward(lhs)) \ - .WithOperand(1, std::forward(rhs)) \ - .WithComparisonDirection(ComparisonDirection::k##NAME); \ - } \ - \ - template \ - inline auto NAME(HloInstructionType** matched_inst, Lhs&& lhs, Rhs&& rhs) \ - ->decltype(Op(matched_inst) \ - .WithOpcode(HloOpcode::kCompare) \ - .WithOperand(0, std::forward(lhs)) \ - .WithOperand(1, std::forward(rhs)) \ - .WithComparisonDirection(ComparisonDirection::k##NAME)) { \ - return Op(matched_inst) \ - .WithOpcode(HloOpcode::kCompare) \ - .WithOperand(0, std::forward(lhs)) \ - .WithOperand(1, std::forward(rhs)) \ - .WithComparisonDirection(ComparisonDirection::k##NAME); \ +#define XLA_COMPARE_PATTERN(NAME) \ + inline auto NAME() { \ + return Op() \ + .WithOpcode(HloOpcode::kCompare) \ + .WithComparisonDirection(ComparisonDirection::k##NAME); \ + } \ + \ + template \ + inline auto NAME(Lhs&& lhs, Rhs&& rhs) { \ + return Op() \ + .WithOpcode(HloOpcode::kCompare) \ + .WithOperand(0, std::forward(lhs)) \ + .WithOperand(1, std::forward(rhs)) \ + .WithComparisonDirection(ComparisonDirection::k##NAME); \ + } \ + \ + template \ + inline auto NAME(HloInstructionType** matched_inst, Lhs&& lhs, Rhs&& rhs) { \ + return Op(matched_inst) \ + .WithOpcode(HloOpcode::kCompare) \ + .WithOperand(0, std::forward(lhs)) \ + .WithOperand(1, std::forward(rhs)) \ + .WithComparisonDirection(ComparisonDirection::k##NAME); \ } -#define XLA_COMMUTATIVE_COMPARE_PATTERN(NAME) \ - XLA_COMPARE_PATTERN(NAME) \ - \ - template \ - inline auto NAME##AnyOrder(HloInstructionType** matched_inst, Lhs&& lhs, \ - Rhs&& rhs) \ - ->decltype(Op(matched_inst) \ - .WithOpcode(HloOpcode::kCompare) \ - .WithBinaryOperandsAnyOrder(std::forward(lhs), \ - std::forward(rhs))) { \ - return Op(matched_inst) \ - .WithOpcode(HloOpcode::kCompare) \ - .WithBinaryOperandsAnyOrder(std::forward(lhs), \ - std::forward(rhs)); \ - } \ - template \ - inline auto NAME##AnyOrder(Lhs&& lhs, Rhs&& rhs) \ - ->decltype(NAME##AnyOrder( \ - nullptr, std::forward(lhs), std::forward(rhs))) { \ - return NAME##AnyOrder( \ - nullptr, std::forward(lhs), std::forward(rhs)); \ +#define XLA_COMMUTATIVE_COMPARE_PATTERN(NAME) \ + XLA_COMPARE_PATTERN(NAME) \ + \ + template \ + inline auto NAME##AnyOrder(HloInstructionType** matched_inst, Lhs&& lhs, \ + Rhs&& rhs) { \ + return Op(matched_inst) \ + .WithOpcode(HloOpcode::kCompare) \ + .WithBinaryOperandsAnyOrder(std::forward(lhs), \ + std::forward(rhs)); \ + } \ + template \ + inline auto NAME##AnyOrder(Lhs&& lhs, Rhs&& rhs) { \ + return NAME##AnyOrder( \ + nullptr, std::forward(lhs), std::forward(rhs)); \ } XLA_COMMUTATIVE_COMPARE_PATTERN(Eq); @@ -2366,23 +2224,17 @@ XLA_COMPARE_PATTERN(Le); XLA_COMPARE_PATTERN(Lt); // Helpers for matching non-constant instructions. 
-inline auto NonConstant() -> decltype(Op().IsNonConstant()) { - return Op().IsNonConstant(); -} +inline auto NonConstant() { return Op().IsNonConstant(); } template -inline auto NonConstant(HloInstructionType** matched_inst) - -> decltype(Op(matched_inst).IsNonConstant()) { +inline auto NonConstant(HloInstructionType** matched_inst) { return Op(matched_inst).IsNonConstant(); } // Add overloads for GetTupleElement which take a int64 specifying which tuple // element is selected. template -inline auto GetTupleElement(Arg&& arg, int64 tuple_index) - -> decltype(Op().WithOpcode(HloOpcode::kGetTupleElement) - .WithOperand(0, std::forward(arg)) - .WithTupleIndex(tuple_index)) { +inline auto GetTupleElement(Arg&& arg, int64 tuple_index) { return Op() .WithOpcode(HloOpcode::kGetTupleElement) .WithOperand(0, std::forward(arg)) @@ -2391,11 +2243,7 @@ inline auto GetTupleElement(Arg&& arg, int64 tuple_index) template inline auto GetTupleElement(HloInstructionType** matched_inst, Arg&& arg, - int64 tuple_index) - -> decltype(Op(matched_inst) - .WithOpcode(HloOpcode::kGetTupleElement) - .WithOperand(0, std::forward(arg)) - .WithTupleIndex(tuple_index)) { + int64 tuple_index) { return Op(matched_inst) .WithOpcode(HloOpcode::kGetTupleElement) .WithOperand(0, std::forward(arg)) @@ -2404,62 +2252,50 @@ inline auto GetTupleElement(HloInstructionType** matched_inst, Arg&& arg, // Add overloads for Parameter which take an int64 specifying the parameter // number. -inline auto Parameter(int64 parameter_num) -> decltype( - Op().WithOpcode(HloOpcode::kParameter).WithParameterNum(parameter_num)) { +inline auto Parameter(int64 parameter_num) { return Op().WithOpcode(HloOpcode::kParameter).WithParameterNum(parameter_num); } template -inline auto Parameter(HloInstructionType** matched_inst, int64 parameter_num) - -> decltype(Op(matched_inst) - .WithOpcode(HloOpcode::kParameter) - .WithParameterNum(parameter_num)) { +inline auto Parameter(HloInstructionType** matched_inst, int64 parameter_num) { return Op(matched_inst) .WithOpcode(HloOpcode::kParameter) .WithParameterNum(parameter_num); } -inline auto ConstantScalar() -> decltype(Op().IsConstantScalar()) { - return Op().IsConstantScalar(); -} +inline auto ConstantScalar() { return Op().IsConstantScalar(); } template -inline auto ConstantScalar(HloInstructionType** matched_inst) - -> decltype(Op(matched_inst).IsConstantScalar()) { +inline auto ConstantScalar(HloInstructionType** matched_inst) { return Op(matched_inst).IsConstantScalar(); } template -inline auto ConstantScalar(ScalarTy val) - -> decltype(Op().IsConstantScalar(val)) { +inline auto ConstantScalar(ScalarTy val) { return Op().IsConstantScalar(val); } template -inline auto ConstantScalar(HloInstructionType** matched_inst, ScalarTy val) - -> decltype(Op(matched_inst).IsConstantScalar(val)) { +inline auto ConstantScalar(HloInstructionType** matched_inst, ScalarTy val) { return Op(matched_inst).IsConstantScalar(val); } -inline auto ConstantEffectiveScalar() -> decltype(Op().IsConstantScalar()) { +inline auto ConstantEffectiveScalar() { return Op().IsConstantEffectiveScalar(); } template -inline auto ConstantEffectiveScalar(HloInstructionType** matched_inst) - -> decltype(Op(matched_inst).IsConstantScalar()) { +inline auto ConstantEffectiveScalar(HloInstructionType** matched_inst) { return Op(matched_inst).IsConstantEffectiveScalar(); } template -inline auto ConstantEffectiveScalar(ScalarTy val) - -> decltype(Op().IsConstantEffectiveScalar(val)) { +inline auto ConstantEffectiveScalar(ScalarTy val) { return 
Op().IsConstantEffectiveScalar(val); } template inline auto ConstantEffectiveScalar(HloInstructionType** matched_inst, - ScalarTy val) - -> decltype(Op(matched_inst).IsConstantEffectiveScalar(val)) { + ScalarTy val) { return Op(matched_inst).IsConstantEffectiveScalar(val); } diff --git a/tensorflow/compiler/xla/service/scatter_expander.cc b/tensorflow/compiler/xla/service/scatter_expander.cc index e3a3feb8640..bd99f920ea0 100644 --- a/tensorflow/compiler/xla/service/scatter_expander.cc +++ b/tensorflow/compiler/xla/service/scatter_expander.cc @@ -325,6 +325,22 @@ static StatusOr> ScatterLoopBody( {updated_operand, scatter_indices, updates}}; } +static int64 ScatterTripCount(HloInstruction* scatter) { + // Compute the trip count for the while loop to be used for scatter. This + // should be the number of indices we should scatter into the operand. + HloInstruction* scatter_indices = scatter->mutable_operand(1); + const Shape& scatter_indices_shape = scatter_indices->shape(); + const ScatterDimensionNumbers& dim_numbers = + scatter->scatter_dimension_numbers(); + int64 scatter_loop_trip_count = 1; + for (int64 i = 0, e = scatter_indices_shape.dimensions_size(); i < e; i++) { + if (i != dim_numbers.index_vector_dim()) { + scatter_loop_trip_count *= scatter_indices_shape.dimensions(i); + } + } + return scatter_loop_trip_count; +} + // High Level Algorithm. // // 1. Canonicalize the scatter_indices tensor such that it has rank 2, where @@ -342,7 +358,7 @@ static StatusOr> ScatterLoopBody( // from c. and d. using the update_computation of scatter. // f. Write the updated value of the slice into the operand tensor. -StatusOr ScatterExpander::ExpandScatter( +StatusOr ScatterExpander::ExpandInstruction( HloInstruction* scatter) { HloInstruction* operand = scatter->mutable_operand(0); HloInstruction* scatter_indices = scatter->mutable_operand(1); @@ -358,13 +374,7 @@ StatusOr ScatterExpander::ExpandScatter( // Compute the trip count for the while loop to be used for scatter. This // should be the number of indices we should scatter into the operand. 
- const Shape& scatter_indices_shape = scatter_indices->shape(); - int64 scatter_loop_trip_count = 1; - for (int64 i = 0, e = scatter_indices_shape.dimensions_size(); i < e; i++) { - if (i != dim_numbers.index_vector_dim()) { - scatter_loop_trip_count *= scatter_indices_shape.dimensions(i); - } - } + int64 scatter_loop_trip_count = ScatterTripCount(scatter); if (!IsInt32(scatter_loop_trip_count)) { return Unimplemented( "Scatter operations with more than 2147483647 scatter indices are not " @@ -408,23 +418,9 @@ StatusOr ScatterExpander::ExpandScatter( return scatter_loop_result.front(); } -StatusOr ScatterExpander::Run(HloModule* module) { - std::vector scatter_instrs; - for (HloComputation* computation : module->MakeNonfusionComputations()) { - for (HloInstruction* instr : computation->instructions()) { - if (instr->opcode() == HloOpcode::kScatter) { - scatter_instrs.push_back(instr); - } - } - } - - for (auto instr : scatter_instrs) { - TF_ASSIGN_OR_RETURN(HloInstruction * expanded_root, ExpandScatter(instr)); - TF_RETURN_IF_ERROR( - instr->parent()->ReplaceInstruction(instr, expanded_root)); - } - - return !scatter_instrs.empty(); +bool ScatterExpander::InstructionMatchesPattern(HloInstruction* inst) { + return inst->opcode() == HloOpcode::kScatter && + (mode_ == kEliminateAllScatters || ScatterTripCount(inst) == 1); } } // namespace xla diff --git a/tensorflow/compiler/xla/service/scatter_expander.h b/tensorflow/compiler/xla/service/scatter_expander.h index 533af060bc9..aa59e7ec3b0 100644 --- a/tensorflow/compiler/xla/service/scatter_expander.h +++ b/tensorflow/compiler/xla/service/scatter_expander.h @@ -16,17 +16,43 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_SCATTER_EXPANDER_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_SCATTER_EXPANDER_H_ -#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/compiler/xla/service/op_expander_pass.h" namespace xla { -class ScatterExpander : public HloModulePass { +// This pass rewrites scatter operations into (roughly) while loops of +// dynamic-update-slices. +// +// This pass can be used in two ways: +// +// - kEliminateAllScatters: For backends that don't support scatter, this pass +// can convert every scatter into a loop. +// +// - kEliminateSimpleScatters: For backends that *do* support scatter, this +// pass can strength-reduce "simple" scatters -- specifically, scatters that +// can be represented without a loop -- to dynamic-update-slices. +// +// Note that even in kEliminateSimpleScatters mode, this pass may still expand a +// scatter into a loop (with a trip-count of 1). It's up to other +// simplification passes to remove the loop. 
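For context, a backend picks the mode when it registers the pass in its pipeline. A minimal sketch under stated assumptions: the helper function and the backend_supports_scatter flag are hypothetical, while HloPassPipeline::AddPass and the ScatterExpander constructor added in this patch are real.

  #include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h"
  #include "tensorflow/compiler/xla/service/scatter_expander.h"

  // Backends with a native scatter lowering only strength-reduce trivial
  // scatters; backends without one expand every scatter into a loop.
  void AddScatterExpansion(xla::HloPassPipeline* pipeline,
                           bool backend_supports_scatter) {
    pipeline->AddPass<xla::ScatterExpander>(
        backend_supports_scatter
            ? xla::ScatterExpander::kEliminateSimpleScatters
            : xla::ScatterExpander::kEliminateAllScatters);
  }
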
+class ScatterExpander : public OpExpanderPass { public: + enum Mode { + kEliminateAllScatters, + kEliminateSimpleScatters, + }; + + explicit ScatterExpander(Mode m) : mode_(m) {} + absl::string_view name() const override { return "scatter_expander"; } - StatusOr Run(HloModule* module) override; protected: - StatusOr ExpandScatter(HloInstruction* scatter); + bool InstructionMatchesPattern(HloInstruction* inst) override; + + StatusOr ExpandInstruction(HloInstruction* scatter) override; + + private: + Mode mode_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/scatter_expander_test.cc b/tensorflow/compiler/xla/service/scatter_expander_test.cc index 3852b82c1ef..9f4cc5406d8 100644 --- a/tensorflow/compiler/xla/service/scatter_expander_test.cc +++ b/tensorflow/compiler/xla/service/scatter_expander_test.cc @@ -57,11 +57,79 @@ TEST_F(ScatterExpanderTest, ScatterOperandWithoutLayout) { ParseAndReturnVerifiedModule(kModuleStr)); // The HLO parser changes all no layout shapes from the input to have a - // default layout, clear the layout of the scatter operand for testing. + // default layout. Clear the layout of the scatter operand for testing. HloInstruction* scatter_operand = FindInstruction(module.get(), "operand"); scatter_operand->mutable_shape()->clear_layout(); - ScatterExpander scatter_expander; + ScatterExpander scatter_expander(ScatterExpander::kEliminateAllScatters); + TF_ASSERT_OK_AND_ASSIGN(bool result, + RunHloPass(&scatter_expander, module.get())); + EXPECT_TRUE(result); +} + +TEST_F(ScatterExpanderTest, EliminateSimpleScattersSkipsNontrivialScatter) { + const char* kModuleStr = R"( + HloModule scatter_expander + + scatter_computation { + parameter0 = s32[] parameter(0) + ROOT parameter1 = s32[] parameter(1) + } + + ENTRY kernel_entry { + operand = s32[3,3] parameter(0) + indices = s32[2] parameter(1) + updates = s32[2,3] parameter(2) + ROOT scatter = s32[3,3] scatter(operand, indices, updates), + to_apply=scatter_computation, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1 + })"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(kModuleStr)); + + // The HLO parser changes all no layout shapes from the input to have a + // default layout. Clear the layout of the scatter operand for testing. + HloInstruction* scatter_operand = FindInstruction(module.get(), "operand"); + scatter_operand->mutable_shape()->clear_layout(); + + ScatterExpander scatter_expander(ScatterExpander::kEliminateSimpleScatters); + TF_ASSERT_OK_AND_ASSIGN(bool result, + RunHloPass(&scatter_expander, module.get())); + EXPECT_FALSE(result); +} + +TEST_F(ScatterExpanderTest, EliminateSimpleScattersRewritesTrivialScatter) { + const char* kModuleStr = R"( + HloModule scatter_expander + + scatter_computation { + parameter0 = s32[] parameter(0) + ROOT parameter1 = s32[] parameter(1) + } + + ENTRY kernel_entry { + operand = s32[5] iota(), iota_dimension=0 + indices = s32[1] parameter(0) + update = s32[] constant(0) + ROOT scatter = s32[5]{0} scatter(operand, indices, update), + update_window_dims={}, inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, index_vector_dim=0, + to_apply=scatter_computation + })"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(kModuleStr)); + + // The HLO parser changes all no layout shapes from the input to have a + // default layout. Clear the layout of the scatter operand for testing. 
+ HloInstruction* scatter_operand = FindInstruction(module.get(), "operand"); + scatter_operand->mutable_shape()->clear_layout(); + + ScatterExpander scatter_expander(ScatterExpander::kEliminateSimpleScatters); TF_ASSERT_OK_AND_ASSIGN(bool result, RunHloPass(&scatter_expander, module.get())); EXPECT_TRUE(result); diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc index 8e39e32e4c3..a96c9c34260 100644 --- a/tensorflow/compiler/xla/service/shape_inference.cc +++ b/tensorflow/compiler/xla/service/shape_inference.cc @@ -2825,6 +2825,38 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return output_shape; } +/* static */ StatusOr ShapeInference::InferDynamicReshapeShape( + const Shape& operand, absl::Span dim_size_shapes, + absl::Span new_size_bounds, + const std::vector& dims_are_dynamic) { + if (new_size_bounds.size() != dims_are_dynamic.size()) { + return InvalidArgument( + "DynamicReshape has to have the same number of elements in new_sizes " + "(%d) and dims_are_dynamic (%d)", + new_size_bounds.size(), dims_are_dynamic.size()); + } + + for (const Shape* dim_size_shape : dim_size_shapes) { + if (dim_size_shape->element_type() != S32 && dim_size_shape->rank() != 0) { + return InvalidArgument( + "DynamicReshape's dim size has to be scalar S32, got (%s): ", + dim_size_shape->ToString()); + } + } + + Shape inferred_shape = ShapeUtil::MakeShape( + operand.element_type(), new_size_bounds, dims_are_dynamic); + if (ShapeUtil::ElementsIn(operand) != ShapeUtil::ElementsIn(inferred_shape)) { + return InvalidArgument( + "Reshape operation has mismatched element counts: from=%d (%s) " + "to=%d (%s).", + ShapeUtil::ElementsIn(operand), ShapeUtil::HumanString(operand), + ShapeUtil::ElementsIn(inferred_shape), + ShapeUtil::HumanString(inferred_shape)); + } + return inferred_shape; +} + /* static */ StatusOr ShapeInference::InferReshapeShape( const Shape& operand, absl::Span dimensions, absl::Span new_sizes, int64 inferred_dimension) { diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h index d47d96ab52d..f03e4e5fa98 100644 --- a/tensorflow/compiler/xla/service/shape_inference.h +++ b/tensorflow/compiler/xla/service/shape_inference.h @@ -241,6 +241,15 @@ class ShapeInference { absl::Span new_sizes, int64 inferred_dimension); + // Infers the shape produced by a dynamic reshape operation from the element + // type of its operand and the new dimension sizes specified. The result shape + // will have dynamic dimensions as specific in `dim_is_dynamic` and bound + // `new_size_bounds`. + static StatusOr InferDynamicReshapeShape( + const Shape& operand, absl::Span dim_size_shapes, + absl::Span new_size_bounds, + const std::vector& dims_are_dynamic); + // Infers the shape produced by a transpose operation from the element type of // its operand and its dimensions field. static StatusOr InferTransposeShape( diff --git a/tensorflow/compiler/xla/service/sharding_propagation.cc b/tensorflow/compiler/xla/service/sharding_propagation.cc index bcbebf3460f..7136ce82e25 100644 --- a/tensorflow/compiler/xla/service/sharding_propagation.cc +++ b/tensorflow/compiler/xla/service/sharding_propagation.cc @@ -120,34 +120,34 @@ HloSharding MergeForMoreSpecificSharding(const HloSharding& a, return IsShardingMoreSpecific(a, b) ? a : b; } -// Returns a sharding that is refined by merging old and to_merge. 
May combine -// partial sharding in addition to MergeForMoreSpecificSharding(). -HloSharding MergeSharding(const HloSharding& old, const HloSharding& to_merge, - bool may_combine_partial_sharding) { +// Tries to refine `to_merge` by combining with `old`. Returns if the final +// `to_merge` is more specific than `old`. May combine partial sharding in +// addition to MergeForMoreSpecificSharding(). +bool MergeSharding(const HloSharding& old, HloSharding* to_merge, + bool may_combine_partial_sharding) { if (old.IsTuple()) { - HloSharding result = old; - CHECK(to_merge.IsTuple()); - CHECK_EQ(old.tuple_elements().size(), to_merge.tuple_elements().size()); - for (int64 i = 0; i < result.tuple_elements().size(); ++i) { - result.tuple_elements()[i] = - MergeSharding(old.tuple_elements()[i], to_merge.tuple_elements()[i], + CHECK(to_merge->IsTuple()); + bool changed = false; + for (int64 i = 0; i < old.tuple_elements().size(); ++i) { + changed |= + MergeSharding(old.tuple_elements()[i], &to_merge->tuple_elements()[i], may_combine_partial_sharding); } - return result; + return changed; } if (!may_combine_partial_sharding || !old.ReplicateOnLastTileDim() || - !to_merge.ReplicateOnLastTileDim() || + !to_merge->ReplicateOnLastTileDim() || old.tile_assignment().num_elements() != - to_merge.tile_assignment().num_elements()) { - return IsShardingMoreSpecific(to_merge, old) ? to_merge : old; + to_merge->tile_assignment().num_elements()) { + return IsShardingMoreSpecific(*to_merge, old); } // Combine the tile dimension sizes from new and old. int64 num_devices = old.tile_assignment().num_elements(); std::vector new_tile_dims; bool compatible = true; - new_tile_dims.reserve(to_merge.tile_assignment().num_dimensions()); - for (int64 i = 0; i < to_merge.tile_assignment().num_dimensions() - 1; ++i) { - int64 new_dim = to_merge.tile_assignment().dim(i); + new_tile_dims.reserve(to_merge->tile_assignment().num_dimensions()); + for (int64 i = 0; i < to_merge->tile_assignment().num_dimensions() - 1; ++i) { + int64 new_dim = to_merge->tile_assignment().dim(i); int64 old_dim = old.tile_assignment().dim(i); if (new_dim == 1) { new_tile_dims.push_back(old_dim); @@ -163,7 +163,7 @@ HloSharding MergeSharding(const HloSharding& old, const HloSharding& to_merge, int64 replication = num_devices / Product(new_tile_dims); if (!compatible || num_devices % Product(new_tile_dims) != 0 || replication >= old.tile_assignment().dimensions().back()) { - return IsShardingMoreSpecific(to_merge, old) ? 
to_merge : old; + return IsShardingMoreSpecific(*to_merge, old); } new_tile_dims.push_back(replication); Array new_tile(new_tile_dims); @@ -174,7 +174,7 @@ HloSharding MergeSharding(const HloSharding& old, const HloSharding& to_merge, const HloSharding& sharding) { int64 group_id = 0; for (int64 i = 0; i < tile_indices.size() - 1; ++i) { - group_id *= to_merge.tile_assignment().dim(i); + group_id *= to_merge->tile_assignment().dim(i); group_id += tile_indices[i]; } return group_id; @@ -183,9 +183,9 @@ HloSharding MergeSharding(const HloSharding& old, const HloSharding& to_merge, [&](absl::Span indices, int64 device) { old_group_members[get_group_index(indices, old)].insert(device); }); - to_merge.tile_assignment().Each( + to_merge->tile_assignment().Each( [&](absl::Span indices, int64 device) { - new_group_members[get_group_index(indices, to_merge)].insert(device); + new_group_members[get_group_index(indices, *to_merge)].insert(device); }); // Try to find the intersection of old and new replication groups, in // order to determine the merged tile assignment. @@ -199,12 +199,12 @@ HloSharding MergeSharding(const HloSharding& old, const HloSharding& to_merge, if (old.tile_assignment().dim(i) == 1) { old_index[i] = 0; } - if (to_merge.tile_assignment().dim(i) == 1) { + if (to_merge->tile_assignment().dim(i) == 1) { new_index[i] = 0; } } int64 old_group_id = get_group_index(old_index, old); - int64 new_group_id = get_group_index(new_index, to_merge); + int64 new_group_id = get_group_index(new_index, *to_merge); if (old_group_members[old_group_id].empty() || new_group_members[new_group_id].empty() || *old_group_members[old_group_id].begin() != @@ -220,11 +220,13 @@ HloSharding MergeSharding(const HloSharding& old, const HloSharding& to_merge, if (replication == 1) { new_tile_dims.pop_back(); new_tile.Reshape(new_tile_dims); - return HloSharding::Tile(new_tile); + *to_merge = HloSharding::Tile(new_tile); + } else { + *to_merge = HloSharding::PartialTile(new_tile); } - return HloSharding::PartialTile(new_tile); + return true; } - return IsShardingMoreSpecific(to_merge, old) ? to_merge : old; + return IsShardingMoreSpecific(*to_merge, old); } // Updates the sharding of the specified instruction with the specified sharding @@ -232,7 +234,7 @@ HloSharding MergeSharding(const HloSharding& old, const HloSharding& to_merge, // been applied. If may_combine_partial_sharding is true, this may combine the // new and existing sharding if they are both partial tiling partial // replication. -bool MaybeImproveInstructionSharding(const HloSharding& sharding, +bool MaybeImproveInstructionSharding(HloSharding sharding, HloInstruction* instruction, bool may_combine_partial_sharding) { // We don't want to propagate tile maximal shardings. @@ -241,13 +243,13 @@ bool MaybeImproveInstructionSharding(const HloSharding& sharding, } // Any sharding is better then no sharding. 
if (!instruction->has_sharding()) { - instruction->set_sharding(sharding); + instruction->set_sharding(std::move(sharding)); return true; } - auto merged = MergeSharding(instruction->sharding(), sharding, + auto merged = MergeSharding(instruction->sharding(), &sharding, may_combine_partial_sharding); - if (merged != instruction->sharding()) { - instruction->set_sharding(merged); + if (merged) { + instruction->set_sharding(std::move(sharding)); return true; } return false; @@ -387,6 +389,7 @@ const HloInstruction* PickRepresentativeOperand( case HloOpcode::kDot: case HloOpcode::kDynamicSlice: case HloOpcode::kDynamicUpdateSlice: + case HloOpcode::kDynamicReshape: case HloOpcode::kFft: case HloOpcode::kFusion: case HloOpcode::kGather: @@ -538,7 +541,7 @@ bool InferDotShardingFromOperands( // Convolution handling for InferShardingFromOperands(). bool InferConvolutionShardingFromOperands(HloInstruction* instruction, - bool aggressive_prop, + int64 aggressiveness, bool may_combine_partial_sharding) { if (auto dot_dims = dot_as_convolution_util::ParseDotGeneralFromConvolution( instruction)) { @@ -586,12 +589,27 @@ bool InferConvolutionShardingFromOperands(HloInstruction* instruction, may_combine_partial_sharding); } +bool CanPropagateThroughAtAgressiveLevel(const HloInstruction& inst, + int64 aggressiveness) { + // At minimum agressiveness, only allow pass-through ops. + if (aggressiveness < 1 && !inst.IsElementwise() && + inst.opcode() != HloOpcode::kTranspose && + inst.opcode() != HloOpcode::kReshape) { + return false; + } + return true; +} + // Tries to update the sharding of the specified instruction based on its // operands and returns true if the sharding of the instruction have been // changed and false otherwise. bool InferShardingFromOperands(HloInstruction* instruction, const ComputationMap& computation_map, - bool is_spmd, bool aggressive_prop) { + bool is_spmd, int64 aggressiveness) { + if (!CanPropagateThroughAtAgressiveLevel(*instruction, aggressiveness)) { + return false; + } + const bool may_combine_partial_sharding = is_spmd && aggressiveness > 0; if (!SupportSpatialPartitioning(instruction, computation_map, is_spmd)) { // If an array shaped HLO doesn't support spatial partitioning but at least // one of its operand is replicated then we make the HLO replicated as well. 
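Two signature changes above drive most of the remaining hunks in this file: MaybeImproveInstructionSharding now takes HloSharding by value (which is why later call sites std::move their temporaries), and the boolean aggressive_prop flag becomes an integer aggressiveness level, with partial-sharding merging allowed only when is_spmd is true and aggressiveness > 0. A rough restatement of the level-0 gate, with a hypothetical helper name that is not part of the patch:

  #include "tensorflow/compiler/xla/service/hlo_instruction.h"
  #include "tensorflow/compiler/xla/service/hlo_opcode.h"

  // At aggressiveness 0, only shape-preserving pass-through ops may pick up a
  // sharding; all other opcodes wait for the more aggressive second fix point.
  bool ParticipatesAtLevelZero(const xla::HloInstruction& inst) {
    return inst.IsElementwise() ||
           inst.opcode() == xla::HloOpcode::kTranspose ||
           inst.opcode() == xla::HloOpcode::kReshape;
  }
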
@@ -604,8 +622,7 @@ bool InferShardingFromOperands(HloInstruction* instruction, return op->has_sharding() && op->sharding().IsReplicated(); })) { return MaybeImproveInstructionSharding( - HloSharding::Replicate(), instruction, - /*may_combine_partial_sharding=*/is_spmd); + HloSharding::Replicate(), instruction, may_combine_partial_sharding); } return false; } @@ -619,7 +636,7 @@ bool InferShardingFromOperands(HloInstruction* instruction, HloSharding new_sharding = operand->sharding().GetSubSharding( operand->shape(), {instruction->tuple_index()}); return MaybeImproveInstructionSharding( - new_sharding, instruction, /*may_combine_partial_sharding=*/is_spmd); + std::move(new_sharding), instruction, may_combine_partial_sharding); } case HloOpcode::kTuple: { if (absl::c_none_of(instruction->operands(), @@ -684,12 +701,12 @@ bool InferShardingFromOperands(HloInstruction* instruction, if (!IsSpatiallyPartitioned(operand)) { continue; } - auto get_maybe_tuple_sharding = [&](const HloSharding& sharding) { + auto get_maybe_tuple_sharding = [&](HloSharding sharding) { if (instruction->operand_count() == 2) { return sharding; } std::vector tuple(instruction->operand_count() / 2, - sharding); + std::move(sharding)); return HloSharding::Tuple(instruction->shape(), tuple); }; if (operand->sharding().IsReplicated() || @@ -701,7 +718,7 @@ bool InferShardingFromOperands(HloInstruction* instruction, // support this in SPMD. changed |= MaybeImproveInstructionSharding( get_maybe_tuple_sharding(HloSharding::Replicate()), instruction, - /*may_combine_partial_sharding=*/is_spmd); + may_combine_partial_sharding); continue; } auto after_partial_replication = @@ -712,7 +729,7 @@ bool InferShardingFromOperands(HloInstruction* instruction, if (after_partial_replication.IsReplicated()) { changed |= MaybeImproveInstructionSharding( get_maybe_tuple_sharding(HloSharding::Replicate()), instruction, - /*may_combine_partial_sharding=*/is_spmd); + may_combine_partial_sharding); continue; } // Use the same sharding for all tuple elements, because they are part @@ -721,8 +738,7 @@ bool InferShardingFromOperands(HloInstruction* instruction, get_maybe_tuple_sharding(hlo_sharding_util::RemoveShapeDimensions( after_partial_replication, instruction->dimensions())); changed |= MaybeImproveInstructionSharding( - new_sharding, instruction, - /*may_combine_partial_sharding=*/is_spmd); + std::move(new_sharding), instruction, may_combine_partial_sharding); } return changed; } @@ -763,12 +779,11 @@ bool InferShardingFromOperands(HloInstruction* instruction, ? 
HloSharding::PartialTile(new_tile_assignment) : HloSharding::Tile(new_tile_assignment); return MaybeImproveInstructionSharding( - new_sharding, instruction, /*may_combine_partial_sharding=*/is_spmd); + std::move(new_sharding), instruction, may_combine_partial_sharding); } case HloOpcode::kConvolution: - return InferConvolutionShardingFromOperands( - instruction, aggressive_prop, - /*may_combine_partial_sharding=*/is_spmd); + return InferConvolutionShardingFromOperands(instruction, aggressiveness, + may_combine_partial_sharding); case HloOpcode::kTranspose: { const HloInstruction* input = instruction->operand(0); if (!IsSpatiallyPartitioned(input)) { @@ -776,8 +791,8 @@ bool InferShardingFromOperands(HloInstruction* instruction, } HloSharding sharding = hlo_sharding_util::TransposeSharding( input->sharding(), instruction->dimensions()); - return MaybeImproveInstructionSharding( - sharding, instruction, /*may_combine_partial_sharding=*/is_spmd); + return MaybeImproveInstructionSharding(std::move(sharding), instruction, + may_combine_partial_sharding); } case HloOpcode::kReduceWindow: { const HloInstruction* lhs = instruction->operand(0); @@ -795,9 +810,8 @@ bool InferShardingFromOperands(HloInstruction* instruction, << instruction->ToString(); return false; } - return MaybeImproveInstructionSharding( - lhs->sharding(), instruction, - /*may_combine_partial_sharding=*/is_spmd); + return MaybeImproveInstructionSharding(lhs->sharding(), instruction, + may_combine_partial_sharding); } case HloOpcode::kSelectAndScatter: { // Shard according to first operand, as output keeps the same shape. @@ -816,9 +830,8 @@ bool InferShardingFromOperands(HloInstruction* instruction, << instruction->ToString(); return false; } - return MaybeImproveInstructionSharding( - lhs->sharding(), instruction, - /*may_combine_partial_sharding=*/is_spmd); + return MaybeImproveInstructionSharding(lhs->sharding(), instruction, + may_combine_partial_sharding); } case HloOpcode::kReshape: { if (!IsSpatiallyPartitioned(instruction->operand(0))) { @@ -829,9 +842,9 @@ bool InferShardingFromOperands(HloInstruction* instruction, instruction->operand(0)->shape(), instruction->shape(), instruction->operand(0)->sharding()); if (new_sharding.has_value()) { - return MaybeImproveInstructionSharding( - new_sharding.value(), instruction, - /*may_combine_partial_sharding=*/is_spmd); + return MaybeImproveInstructionSharding(std::move(*new_sharding), + instruction, + may_combine_partial_sharding); } return false; } @@ -842,14 +855,13 @@ bool InferShardingFromOperands(HloInstruction* instruction, return MaybeImproveInstructionSharding( hlo_sharding_util::ReverseSharding( instruction->operand(0)->sharding(), instruction->dimensions()), - instruction, /*may_combine_partial_sharding=*/is_spmd); + instruction, may_combine_partial_sharding); } case HloOpcode::kDot: { const auto& dnums = dot_as_convolution_util::ParseDotGeneralFromDot(instruction); - return InferDotShardingFromOperands( - instruction, dnums, - /*may_combine_partial_sharding=*/is_spmd); + return InferDotShardingFromOperands(instruction, dnums, + may_combine_partial_sharding); } case HloOpcode::kParameter: { auto parent_it = computation_map.find(instruction->parent()); @@ -864,7 +876,7 @@ bool InferShardingFromOperands(HloInstruction* instruction, if (parent->operand(i)->has_sharding()) { return MaybeImproveInstructionSharding( parent->operand(i)->sharding(), instruction, - /*may_combine_partial_sharding=*/is_spmd); + may_combine_partial_sharding); } return false; } @@ -891,16 +903,15 
@@ bool InferShardingFromOperands(HloInstruction* instruction, if (instruction->shape().IsTuple()) { return MaybeImproveInstructionSharding( HloSharding::SingleTuple(instruction->shape(), operand->sharding()), - instruction, /*may_combine_partial_sharding=*/is_spmd); + instruction, may_combine_partial_sharding); } else { - return MaybeImproveInstructionSharding( - operand->sharding(), instruction, - /*may_combine_partial_sharding=*/is_spmd); + return MaybeImproveInstructionSharding(operand->sharding(), instruction, + may_combine_partial_sharding); } } case HloOpcode::kDynamicSlice: case HloOpcode::kDynamicUpdateSlice: { - auto propagate_slicing = [instruction, is_spmd]() { + auto propagate_slicing = [&]() { const HloInstruction* operand = instruction->opcode() == HloOpcode::kDynamicSlice ? instruction->operand(0) @@ -910,9 +921,9 @@ bool InferShardingFromOperands(HloInstruction* instruction, } if (operand->sharding().IsReplicated()) { - return MaybeImproveInstructionSharding( - HloSharding::Replicate(), instruction, - /*may_combine_partial_sharding=*/is_spmd); + return MaybeImproveInstructionSharding(HloSharding::Replicate(), + instruction, + may_combine_partial_sharding); } const auto& tile_assignment = operand->sharding().tile_assignment(); @@ -923,11 +934,10 @@ bool InferShardingFromOperands(HloInstruction* instruction, return false; } } - return MaybeImproveInstructionSharding( - operand->sharding(), instruction, - /*may_combine_partial_sharding=*/is_spmd); + return MaybeImproveInstructionSharding(operand->sharding(), instruction, + may_combine_partial_sharding); }; - auto propagate_base = [instruction, is_spmd]() { + auto propagate_base = [&]() { if (instruction->opcode() != HloOpcode::kDynamicUpdateSlice) { return false; } @@ -936,7 +946,7 @@ bool InferShardingFromOperands(HloInstruction* instruction, } return MaybeImproveInstructionSharding( instruction->operand(0)->sharding(), instruction, - /*may_combine_partial_sharding=*/is_spmd); + may_combine_partial_sharding); }; return propagate_slicing() || propagate_base(); } @@ -946,8 +956,7 @@ bool InferShardingFromOperands(HloInstruction* instruction, HloSharding new_sharding = hlo_sharding_util::GatherOutputSharding( instruction->operand(1)->sharding(), instruction); changed |= MaybeImproveInstructionSharding( - new_sharding, instruction, - /*may_combine_partial_sharding=*/is_spmd); + std::move(new_sharding), instruction, may_combine_partial_sharding); } if (is_spmd && IsSpatiallyPartitioned(instruction->operand(0))) { auto maybe_from_data = @@ -955,8 +964,8 @@ bool InferShardingFromOperands(HloInstruction* instruction, instruction->operand(0)->sharding(), *instruction); if (maybe_from_data) { changed |= MaybeImproveInstructionSharding( - *maybe_from_data, instruction, - /*may_combine_partial_sharding=*/is_spmd); + std::move(*maybe_from_data), instruction, + may_combine_partial_sharding); } } return changed; @@ -966,7 +975,7 @@ bool InferShardingFromOperands(HloInstruction* instruction, if (is_spmd && IsSpatiallyPartitioned(instruction->operand(0))) { changed |= MaybeImproveInstructionSharding( instruction->operand(0)->sharding(), instruction, - /*may_combine_partial_sharding=*/is_spmd); + may_combine_partial_sharding); } if (!IsSpatiallyPartitioned(instruction->operand(1)) && !IsSpatiallyPartitioned(instruction->operand(2))) { @@ -978,13 +987,12 @@ bool InferShardingFromOperands(HloInstruction* instruction, instruction->operand(2)->sharding(), *instruction); if (maybe_from_update) { changed |= MaybeImproveInstructionSharding( - 
*maybe_from_update, instruction, - /*may_combine_partial_sharding=*/is_spmd); + std::move(*maybe_from_update), instruction, + may_combine_partial_sharding); } } changed |= MaybeImproveInstructionSharding( - HloSharding::Replicate(), instruction, - /*may_combine_partial_sharding=*/is_spmd); + HloSharding::Replicate(), instruction, may_combine_partial_sharding); return changed; } case HloOpcode::kWhile: { @@ -996,17 +1004,16 @@ bool InferShardingFromOperands(HloInstruction* instruction, sharding = MergeForMoreSpecificSharding(sharding, instruction->sharding()); } - return MaybeImproveInstructionSharding( - sharding, instruction, /*may_combine_partial_sharding=*/is_spmd); + return MaybeImproveInstructionSharding(std::move(sharding), instruction, + may_combine_partial_sharding); } default: { - if (instruction->IsElementwise() && is_spmd) { + if (instruction->IsElementwise() && may_combine_partial_sharding) { bool changed = false; for (auto operand : instruction->operands()) { if (IsSpatiallyPartitioned(operand)) { changed |= MaybeImproveInstructionSharding( - operand->sharding(), instruction, - /*may_combine_partial_sharding=*/is_spmd); + operand->sharding(), instruction, may_combine_partial_sharding); } } return changed; @@ -1015,9 +1022,8 @@ bool InferShardingFromOperands(HloInstruction* instruction, if (!operand || !IsSpatiallyPartitioned(operand)) { return false; } - return MaybeImproveInstructionSharding( - operand->sharding(), instruction, - /*may_combine_partial_sharding=*/is_spmd); + return MaybeImproveInstructionSharding(operand->sharding(), instruction, + may_combine_partial_sharding); } } return false; @@ -1088,12 +1094,14 @@ HloSharding InferDotOperandSharding( operand_to_other_dims[operand_index == 0 ? dim.lhs : dim.rhs] = operand_index == 0 ? dim.rhs : dim.lhs; } - sharding = - MergeSharding(sharding, - *hlo_sharding_util::TransposeShardingWithCollapsedDims( - other_operand_dims_replicated, other_to_operand_dims, - operand_to_other_dims), - may_combine_partial_sharding); + HloSharding sharding_from_other = + *hlo_sharding_util::TransposeShardingWithCollapsedDims( + other_operand_dims_replicated, other_to_operand_dims, + operand_to_other_dims); + if (MergeSharding(sharding, &sharding_from_other, + may_combine_partial_sharding)) { + sharding = std::move(sharding_from_other); + } } return sharding; } @@ -1101,10 +1109,14 @@ HloSharding InferDotOperandSharding( // Return the sharding that should be propagated from user to instruction. 
absl::optional GetShardingFromUser( const HloInstruction& instruction, const HloInstruction& user, - bool aggressive_prop, bool is_spmd) { + int64 aggressiveness, bool is_spmd) { + if (!CanPropagateThroughAtAgressiveLevel(user, aggressiveness)) { + return absl::nullopt; + } if (!IsSpatiallyPartitioned(&user)) { return absl::nullopt; } + const bool may_combine_partial_sharding = is_spmd && aggressiveness > 0; switch (user.opcode()) { case HloOpcode::kBroadcast: { if (user.sharding().IsReplicated()) { @@ -1176,9 +1188,8 @@ absl::optional GetShardingFromUser( if (auto dot_dims = dot_as_convolution_util::ParseDotGeneralFromConvolution(&user)) { int64 op_idx = user.operand_index(&instruction); - return InferDotOperandSharding( - &user, *dot_dims, op_idx, - /*may_combine_partial_sharding=*/is_spmd); + return InferDotOperandSharding(&user, *dot_dims, op_idx, + may_combine_partial_sharding); } return absl::nullopt; } @@ -1263,7 +1274,7 @@ absl::optional GetShardingFromUser( int64 op_idx = user.operand_index(&instruction); auto dnums = dot_as_convolution_util::ParseDotGeneralFromDot(&user); return InferDotOperandSharding(&user, dnums, op_idx, - /*may_combine_partial_sharding=*/is_spmd); + may_combine_partial_sharding); } case HloOpcode::kReduce: { if (instruction.shape().rank() == 0) { @@ -1364,18 +1375,18 @@ absl::optional GetShardingFromUser( // false otherwise. bool InferShardingFromUsers(HloInstruction* instruction, const ComputationMap& computation_map, - bool aggressive_prop, bool is_spmd) { + int64 aggressiveness, bool is_spmd) { if (!SupportSpatialPartitioning(instruction, computation_map, is_spmd)) { return false; } bool improved_sharding = false; + const bool may_combine_partial_sharding = is_spmd && aggressiveness > 0; for (const HloInstruction* user : instruction->users()) { absl::optional user_sharding = - GetShardingFromUser(*instruction, *user, aggressive_prop, is_spmd); + GetShardingFromUser(*instruction, *user, aggressiveness, is_spmd); if (user_sharding) { improved_sharding |= MaybeImproveInstructionSharding( - *user_sharding, instruction, - /*may_combine_partial_sharding=*/is_spmd); + std::move(*user_sharding), instruction, may_combine_partial_sharding); } } return improved_sharding; @@ -1645,10 +1656,18 @@ StatusOr ShardingPropagation::Run(HloModule* module) { // strictly improve the sharding of the graph and it can't be improved // indefinitely. int64 iterations = 0; - auto run_to_fix_point = [&](bool aggressive_prop) { - bool changed = true; - while (changed) { - changed = false; + auto run_to_fix_point = [&](int64 aggressiveness) { + absl::flat_hash_set workset; + for (const HloComputation* computation : module->computations()) { + for (const HloInstruction* instruction : computation->instructions()) { + // Remove the instructions where the sharding was provided from the + // outside so we don't modify them. + if (!provided_shardings.contains(instruction)) { + workset.insert(instruction); + } + } + } + while (!workset.empty()) { int64 inferred_from_operand_counter = 0; int64 inferred_from_user_counter = 0; int64 instruction_counter = 0; @@ -1662,12 +1681,10 @@ StatusOr ShardingPropagation::Run(HloModule* module) { already_sharded_counter += (instruction->has_sharding() ? 1 : 0); } - // Remove the instructions where the sharding was provided from the - // outside so we don't modify them. 
instructions.erase( std::remove_if(instructions.begin(), instructions.end(), [&](HloInstruction* instruction) { - return provided_shardings.contains(instruction); + return !workset.contains(instruction); }), instructions.end()); @@ -1675,28 +1692,40 @@ StatusOr ShardingPropagation::Run(HloModule* module) { // operands. for (HloInstruction* instruction : instructions) { if (InferShardingFromOperands(instruction, computation_map, is_spmd_, - aggressive_prop)) { + aggressiveness)) { ++inferred_from_operand_counter; - changed = true; + any_changed = true; VLOG(2) << "Add sharding (forward-pass): " << instruction->ToString(); maybe_computation_propagation(instruction); + for (auto user : instruction->users()) { + if (!provided_shardings.contains(user)) { + workset.insert(user); + } + } + } else { + workset.erase(instruction); } } // Then iterate the HLO graph in reverse post order taking shardings // from users. for (auto it = instructions.rbegin(); it != instructions.rend(); ++it) { - if (InferShardingFromUsers(*it, computation_map, aggressive_prop, + if (InferShardingFromUsers(*it, computation_map, aggressiveness, is_spmd_)) { ++inferred_from_user_counter; - changed = true; + any_changed = true; VLOG(2) << "Add sharding (backward-pass): " << (*it)->ToString(); maybe_computation_propagation(*it); + workset.insert(*it); + for (auto operand : (*it)->operands()) { + if (!provided_shardings.contains(operand)) { + workset.insert(operand); + } + } } } } - any_changed |= changed; VLOG(1) << "Sharding propagation iteration " << iterations << ";"; VLOG(1) << " total instructions: " << instruction_counter; VLOG(1) << " instructions already sharded: " << already_sharded_counter; @@ -1707,8 +1736,8 @@ StatusOr ShardingPropagation::Run(HloModule* module) { ++iterations; } }; - run_to_fix_point(false); - run_to_fix_point(true); + run_to_fix_point(0); + run_to_fix_point(1); VLOG(1) << "Sharding propagation completed after " << iterations << " iterations"; diff --git a/tensorflow/compiler/xla/service/sharding_propagation_test.cc b/tensorflow/compiler/xla/service/sharding_propagation_test.cc index 5ed1398149b..03c77c2038c 100644 --- a/tensorflow/compiler/xla/service/sharding_propagation_test.cc +++ b/tensorflow/compiler/xla/service/sharding_propagation_test.cc @@ -556,6 +556,43 @@ ENTRY %replicated { op::Sharding("{devices=[1,2,2,1]0,1,2,3}")); } +TEST_F(ShardingPropagationTest, PartialReplicateReshapeForwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %reshape { + %param0 = f32[1430,1]{1,0} parameter(0), + sharding={devices=[2,1,2]0,1,2,3 last_tile_dim_replicate} + %reshape = f32[10,11,13]{2,1,0} reshape(%param0) + ROOT %copy = f32[10,11,13]{2,1,0} copy(%reshape) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT( + FindInstruction(module.get(), "reshape"), + op::Sharding("{devices=[2,1,1,2]0,1,2,3 last_tile_dim_replicate}")); +} + +TEST_F(ShardingPropagationTest, PartialReplicateReshapeBackwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %reshape { + %param0 = f32[2002,1]{1,0} parameter(0) + %copy = f32[2002,1]{1,0} copy(f32[2002,1]{1,0} %param0) + ROOT %reshape = f32[14,11,13]{2,1,0} reshape(%copy), + sharding={devices=[2,1,1,2]0,1,2,3 last_tile_dim_replicate} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool 
changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "copy"), + op::Sharding("{devices=[2,1,2]0,1,2,3 last_tile_dim_replicate}")); +} + TEST_F(ShardingPropagationTest, DontShardTuplesIfAllInputIsMaximal) { const char* const hlo_string = R"( HloModule module @@ -1779,6 +1816,52 @@ ENTRY entry { op::Sharding("{devices=[2]0,1}")); } +TEST_F(ShardingPropagationTest, GatherToIndex2) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %input = bf16[2,4819,4] parameter(0), sharding={replicated} + %p1 = s32[2,1000,2] parameter(1) + %indices = s32[2,1000,2] copy(%p1) + ROOT %gather = bf16[2,1000,4] + gather(bf16[2,4819,4] %input, s32[2,1000,2] %indices), + offset_dims={2}, collapsed_slice_dims={0,1}, + start_index_map={0,1}, index_vector_dim=2, slice_sizes={1,1,4}, + sharding={devices=[1,2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "indices"), + op::Sharding("{devices=[1,2,1]0,1}")); +} + +TEST_F(ShardingPropagationTest, GatherToIndex3) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %input = bf16[2,4819,4] parameter(0), sharding={replicated} + %p1 = s32[2,2,1000] parameter(1) + %indices = s32[2,2,1000] copy(%p1) + ROOT %gather = bf16[2,1000,4] + gather(bf16[2,4819,4] %input, s32[2,2,1000] %indices), + offset_dims={2}, collapsed_slice_dims={0,1}, + start_index_map={0,1}, index_vector_dim=1, slice_sizes={1,1,4}, + sharding={devices=[1,2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "indices"), + op::Sharding("{devices=[1,1,2]0,1}")); +} + TEST_F(ShardingPropagationTest, GatherToDataOperand) { const char* hlo_string = R"( HloModule module @@ -2039,5 +2122,45 @@ ENTRY entry { op::Sharding("{devices=[2,2,2]0,1,4,5,2,3,6,7 last_tile_dim_replicate}")); } +TEST_F(ShardingPropagationTest, PartialShardingTransposeForwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %transpose { + %param = f32[7,11,13]{2,1,0} parameter(0), + sharding={devices=[2,1,2,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + %transpose = f32[11,13,7]{2,1,0} transpose(%param), dimensions={1,2,0} + ROOT %copy = f32[11,13,7]{2,1,0} copy(%transpose) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT( + FindInstruction(module.get(), "transpose"), + op::Sharding( + "{devices=[1,2,2,2]0,1,4,5,2,3,6,7 last_tile_dim_replicate}")); +} + +TEST_F(ShardingPropagationTest, PartialShardingTransposeBackwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %transpose { + %param = f32[7,11,13]{2,1,0} parameter(0) + %copy = f32[7,11,13]{2,1,0} copy(%param) + ROOT %transpose = f32[11,13,7]{2,1,0} transpose(%copy), dimensions={1,2,0}, + sharding={devices=[1,2,2,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + 
EXPECT_THAT( + FindInstruction(module.get(), "copy"), + op::Sharding( + "{devices=[2,1,2,2]0,1,4,5,2,3,6,7 last_tile_dim_replicate}")); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/spmd/BUILD b/tensorflow/compiler/xla/service/spmd/BUILD index dd3da796d61..d2243d30adf 100644 --- a/tensorflow/compiler/xla/service/spmd/BUILD +++ b/tensorflow/compiler/xla/service/spmd/BUILD @@ -74,3 +74,16 @@ tf_cc_test( "//tensorflow/core:test", ], ) + +cc_library( + name = "schedule_aware_all_gather_cse", + srcs = ["schedule_aware_all_gather_cse.cc"], + hdrs = ["schedule_aware_all_gather_cse.h"], + deps = [ + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_casting_utils", + "//tensorflow/compiler/xla/service:hlo_pass", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/container:flat_hash_map", + ], +) diff --git a/tensorflow/compiler/xla/service/spmd/dot_handler.cc b/tensorflow/compiler/xla/service/spmd/dot_handler.cc index a24bafe26ce..da432965497 100644 --- a/tensorflow/compiler/xla/service/spmd/dot_handler.cc +++ b/tensorflow/compiler/xla/service/spmd/dot_handler.cc @@ -100,7 +100,8 @@ StatusOr PartitionBaseCase( int64 output_rhs_non_contracting_partitions, int64 threshold_for_windowed_einsum_mib, SpmdBuilder* b, std::vector* - windowed_dot_general_loops) { + windowed_dot_general_loops, + bool may_reshard_without_detecting_match) { const HloSharding& lhs_sharding = lhs.sharding(); const HloSharding& rhs_sharding = rhs.sharding(); if (lhs_sharding.ReplicateOnLastTileDim() || @@ -491,29 +492,36 @@ StatusOr PartitionBaseCase( return dot; } - // Output is batch partitioned. - if (output_batch_partitions == num_partitions) { - auto resharded_lhs = lhs.Reshard(*output_sharding_transposed_to_match_lhs); - auto resharded_rhs = rhs.Reshard(*output_sharding_transposed_to_match_rhs); - TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(resharded_lhs.hlo(), - resharded_rhs.hlo(), b)); - return dot; - } - // Output is partitioned along LHS non-contracting dimensions. - if (output_lhs_non_contracting_partitions == num_partitions) { - auto resharded_lhs = lhs.Reshard(*output_sharding_transposed_to_match_lhs); - auto replicated_rhs = rhs.Reshard(HloSharding::Replicate()); - TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(resharded_lhs.hlo(), - replicated_rhs.hlo(), b)); - return dot; - } - // Output is partitioned along RHS non-contracting dimensions. - if (output_rhs_non_contracting_partitions == num_partitions) { - auto replicated_lhs = lhs.Reshard(HloSharding::Replicate()); - auto resharded_rhs = rhs.Reshard(*output_sharding_transposed_to_match_rhs); - TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(replicated_lhs.hlo(), - resharded_rhs.hlo(), b)); - return dot; + if (may_reshard_without_detecting_match) { + // Output is batch partitioned. + if (output_batch_partitions == num_partitions) { + auto resharded_lhs = + lhs.Reshard(*output_sharding_transposed_to_match_lhs); + auto resharded_rhs = + rhs.Reshard(*output_sharding_transposed_to_match_rhs); + TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(resharded_lhs.hlo(), + resharded_rhs.hlo(), b)); + return dot; + } + // Output is partitioned along LHS non-contracting dimensions. 
+ if (output_lhs_non_contracting_partitions == num_partitions) { + auto resharded_lhs = + lhs.Reshard(*output_sharding_transposed_to_match_lhs); + auto replicated_rhs = rhs.Reshard(HloSharding::Replicate()); + TF_ASSIGN_OR_RETURN( + auto dot, + create_sharded_dot(resharded_lhs.hlo(), replicated_rhs.hlo(), b)); + return dot; + } + // Output is partitioned along RHS non-contracting dimensions. + if (output_rhs_non_contracting_partitions == num_partitions) { + auto replicated_lhs = lhs.Reshard(HloSharding::Replicate()); + auto resharded_rhs = + rhs.Reshard(*output_sharding_transposed_to_match_rhs); + TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(replicated_lhs.hlo(), + resharded_rhs.hlo(), b)); + return dot; + } } // Returns true if it is beneficial to reshard the operand at `operand_idx` @@ -808,7 +816,8 @@ StatusOr PartitionDotGroupOnBatch( StatusOr PartitionDotGroupOnNonContracting( bool lhs_matching, PartitionedHlo matching, PartitionedHlo other, int64 matching_contracting_partitions, int64 other_contracting_partitions, - int64 matching_non_contracting_partitions, + absl::Span + partitioned_non_contractin_dims, int64 other_non_contracting_partitions, int64 output_other_non_contracting_partitions, const Shape& output_base_shape, const HloSharding& output_sharding, @@ -828,48 +837,20 @@ StatusOr PartitionDotGroupOnNonContracting( } }); - const bool may_replicate_other_contracting_dims = - (other_contracting_partitions == matching_non_contracting_partitions && - other_non_contracting_partitions == - output_other_non_contracting_partitions); - const bool may_replicate_other_non_contracting_dims = - matching_non_contracting_partitions == other_non_contracting_partitions && - matching_contracting_partitions == other_contracting_partitions; - std::vector other_group_dims; - if (may_replicate_other_contracting_dims && - (!may_replicate_other_non_contracting_dims || - ShapeUtil::ByteSizeOf(other.hlo()->shape()) <= - ShapeUtil::ByteSizeOf( - MakePartitionedShape(output_base_shape, output_sharding)))) { - for (const auto& dim : dims_mapping.contracting_dims) { - other_group_dims.push_back(lhs_matching ? dim.rhs : dim.lhs); - } - } else if (may_replicate_other_non_contracting_dims) { - for (const auto& dim : lhs_matching - ? dims_mapping.rhs_non_contracting_dims - : dims_mapping.lhs_non_contracting_dims) { - other_group_dims.push_back(lhs_matching ? dim.rhs : dim.lhs); - } - } else if (!(other.sharding().ReplicateOnLastTileDim() && - other.sharding().tile_assignment().dimensions().back() % - matching_non_contracting_partitions == - 0) && - !other.sharding().IsReplicated()) { - return nullptr; - } auto matching_sharding_dims = matching.sharding().tile_assignment().dimensions(); std::vector matching_dims; std::vector output_dims; + int64 group_count = 1; // Make sure the partitioning on matching's non-contracting dimensions // defines the same device groups for both matching and output. - for (const auto& dim : lhs_matching ? dims_mapping.lhs_non_contracting_dims - : dims_mapping.rhs_non_contracting_dims) { + for (const auto& dim : partitioned_non_contractin_dims) { int64 md = lhs_matching ? 
dim.lhs : dim.rhs; matching_sharding_dims[md] = output_sharding.tile_assignment().dim(dim.output); matching_dims.push_back(md); output_dims.push_back(dim.output); + group_count *= output_sharding.tile_assignment().dim(dim.output); } auto output_grouped = GroupShardingOnDims(output_sharding, output_dims); auto reshaped_matching_tiling = matching.sharding().tile_assignment(); @@ -885,6 +866,42 @@ StatusOr PartitionDotGroupOnNonContracting( matching.sharding() != UngroupSharding(matching_grouped)) { return nullptr; } + + std::vector other_group_dims; + if (other.sharding().ReplicateOnLastTileDim() && + other.sharding().tile_assignment().dimensions().back() % group_count == + 0) { + other_group_dims.push_back(other.base_shape().rank()); + } else { + const bool may_replicate_other_contracting_dims = + (other_contracting_partitions == group_count && + other_non_contracting_partitions == + output_other_non_contracting_partitions); + const bool may_replicate_other_non_contracting_dims = + group_count == other_non_contracting_partitions && + matching_contracting_partitions == other_contracting_partitions; + if (auto found_dims = FindMatchingPartitionedDimsForGrouping( + other.sharding(), output_grouped.device_groups)) { + other_group_dims = std::move(*found_dims); + } else if (may_replicate_other_contracting_dims && + (!may_replicate_other_non_contracting_dims || + ShapeUtil::ByteSizeOf(other.hlo()->shape()) <= + ShapeUtil::ByteSizeOf(MakePartitionedShape( + output_base_shape, output_sharding)))) { + for (const auto& dim : dims_mapping.contracting_dims) { + other_group_dims.push_back(lhs_matching ? dim.rhs : dim.lhs); + } + } else if (may_replicate_other_non_contracting_dims) { + for (const auto& dim : lhs_matching + ? dims_mapping.rhs_non_contracting_dims + : dims_mapping.lhs_non_contracting_dims) { + other_group_dims.push_back(lhs_matching ? dim.rhs : dim.lhs); + } + } else { + other = other.Replicate(); + } + } + matching = matching.Reshard(UngroupSharding(matching_grouped)); auto per_group_partitioner_state = CreatePerGroupPartitioningState( matching.state(), matching_grouped.device_groups, b); @@ -896,16 +913,14 @@ StatusOr PartitionDotGroupOnNonContracting( per_group_partitioner_state); auto partially_replicated_other = other.hlo(); - if (other.sharding().ReplicateOnLastTileDim() && - other.sharding().tile_assignment().dimensions().back() % - matching_non_contracting_partitions == - 0) { + if (other_group_dims.size() == 1 && + other_group_dims[0] == other.base_shape().rank()) { + // Group on replication dim. auto grouped = AlignGroupsWith( GroupShardingOnDims( - other.sharding(), - {other.sharding().tile_assignment().num_dimensions() - 1}, + other.sharding(), {other_group_dims[0]}, {other.sharding().tile_assignment().dimensions().back() / - matching_non_contracting_partitions}), + group_count}), output_grouped); other = other.Reshard(UngroupSharding(grouped)); partially_replicated_other = other.hlo(); @@ -916,9 +931,13 @@ StatusOr PartitionDotGroupOnNonContracting( AlignGroupsWith(GroupShardingOnDims(other.sharding(), other_group_dims), output_grouped, /*ignore_group_order=*/true); other = other.Reshard(UngroupSharding(other_grouped)); - // TODO(yuanzx): Use reshard to replicate when ready. 
partially_replicated_other = - other.ReplicatePartial(other_grouped.group_dims); + other + .Reshard(hlo_sharding_util::PartiallyReplicateTiledShardingOnDims( + other.sharding(), other_grouped.group_dims)) + .hlo(); + top_level_sharding_to_reset.emplace_back( + partially_replicated_other, partially_replicated_other->sharding()); partially_replicated_other->set_sharding(other_grouped.sharding); } auto other_p = PartitionedHlo(partially_replicated_other, other.base_shape(), @@ -937,7 +956,9 @@ StatusOr PartitionDotGroupOnNonContracting( } StatusOr PartitionDotGroupOnContracting( - PartitionedHlo lhs, PartitionedHlo rhs, int64 contracting_partitions, + PartitionedHlo lhs, PartitionedHlo rhs, + absl::Span + partitioned_contractin_dims, int64 output_batch_partitions, int64 output_lhs_non_contracting_partitions, int64 output_rhs_non_contracting_partitions, const Shape& output_base_shape, const HloSharding& output_sharding, @@ -962,13 +983,15 @@ StatusOr PartitionDotGroupOnContracting( auto rhs_tile_shape = rhs_sharding.tile_assignment().dimensions(); std::vector lhs_dims; std::vector rhs_dims; - for (const auto& dim : dims_mapping.contracting_dims) { + int64 group_count = 1; + for (const auto& dim : partitioned_contractin_dims) { lhs_dims.push_back(dim.lhs); rhs_dims.push_back(dim.rhs); + group_count *= lhs_sharding.tile_assignment().dim(dim.lhs); } if (ShapeUtil::ByteSizeOf(lhs.hlo()->shape()) > ShapeUtil::ByteSizeOf(rhs.hlo()->shape())) { - for (const auto& dim : dims_mapping.contracting_dims) { + for (const auto& dim : partitioned_contractin_dims) { rhs_tile_shape[dim.rhs] = lhs_tile_shape[dim.lhs]; } auto new_tile = rhs.sharding().tile_assignment(); @@ -977,7 +1000,7 @@ StatusOr PartitionDotGroupOnContracting( ? HloSharding::PartialTile(new_tile) : HloSharding::Tile(new_tile); } else { - for (const auto& dim : dims_mapping.contracting_dims) { + for (const auto& dim : partitioned_contractin_dims) { lhs_tile_shape[dim.lhs] = rhs_tile_shape[dim.rhs]; } auto new_tile = lhs.sharding().tile_assignment(); @@ -1012,43 +1035,47 @@ StatusOr PartitionDotGroupOnContracting( HloSharding inner_output_sharding = HloSharding::Replicate(); HloSharding outer_output_tmp_sharding = HloSharding::Replicate(); if (output_sharding.ReplicateOnLastTileDim() && - output_sharding.tile_assignment().dimensions().back() % - contracting_partitions == + output_sharding.tile_assignment().dimensions().back() % group_count == 0) { auto grouped = AlignGroupsWith( GroupShardingOnDims( output_sharding, {output_sharding.tile_assignment().num_dimensions() - 1}, {output_sharding.tile_assignment().dimensions().back() / - contracting_partitions}), - GroupShardingOnDims(lhs_sharding, lhs_dims)); + group_count}), + lhs_grouped); outer_output_tmp_sharding = UngroupSharding(grouped); inner_output_sharding = std::move(grouped.sharding); - } else if (output_lhs_non_contracting_partitions == contracting_partitions || - output_rhs_non_contracting_partitions == contracting_partitions || - output_batch_partitions == contracting_partitions) { + } else { std::vector group_dims; - if (output_lhs_non_contracting_partitions == contracting_partitions) { - for (const auto& dim : dims_mapping.lhs_non_contracting_dims) { - group_dims.push_back(dim.output); - } - } else if (output_rhs_non_contracting_partitions == - contracting_partitions) { - for (const auto& dim : dims_mapping.rhs_non_contracting_dims) { - group_dims.push_back(dim.output); - } - } else { - for (const auto& dim : dims_mapping.batch_dims) { - group_dims.push_back(dim.output); + if (auto 
found_dims = FindMatchingPartitionedDimsForGrouping( + output_sharding, lhs_grouped.device_groups)) { + group_dims = std::move(*found_dims); + } else if (output_lhs_non_contracting_partitions == group_count || + output_rhs_non_contracting_partitions == group_count || + output_batch_partitions == group_count) { + if (output_lhs_non_contracting_partitions == group_count) { + for (const auto& dim : dims_mapping.lhs_non_contracting_dims) { + group_dims.push_back(dim.output); + } + } else if (output_rhs_non_contracting_partitions == group_count) { + for (const auto& dim : dims_mapping.rhs_non_contracting_dims) { + group_dims.push_back(dim.output); + } + } else { + for (const auto& dim : dims_mapping.batch_dims) { + group_dims.push_back(dim.output); + } } } - auto grouped = - AlignGroupsWith(GroupShardingOnDims(output_sharding, group_dims), - GroupShardingOnDims(lhs_sharding, lhs_dims)); - inner_output_sharding = grouped.sharding; - outer_output_tmp_sharding = - hlo_sharding_util::PartiallyReplicateTiledShardingOnDims( - UngroupSharding(grouped), group_dims); + if (!group_dims.empty()) { + auto grouped = AlignGroupsWith( + GroupShardingOnDims(output_sharding, group_dims), lhs_grouped); + inner_output_sharding = grouped.sharding; + outer_output_tmp_sharding = + hlo_sharding_util::PartiallyReplicateTiledShardingOnDims( + UngroupSharding(grouped), group_dims); + } } auto inner_state = CreatePerGroupPartitioningState( lhs.state(), lhs_grouped.device_groups, b); @@ -1062,10 +1089,9 @@ StatusOr PartitionDotGroupOnContracting( GetPerGroupBaseShape(rhs_grouped, rhs.base_shape()), inner_state), MakePartitionedShape(output_base_shape, outer_output_tmp_sharding), - inner_output_sharding, dims_mapping, - num_partitions / contracting_partitions, create_sharded_dot, module, - original_hlo, threshold_for_windowed_einsum_mib, b, - windowed_dot_general_loops)); + inner_output_sharding, dims_mapping, num_partitions / group_count, + create_sharded_dot, module, original_hlo, + threshold_for_windowed_einsum_mib, b, windowed_dot_general_loops)); if (!dot) { return nullptr; } @@ -1141,6 +1167,8 @@ StatusOr PartitionDot( output_sharding, dims_mapping.lhs_non_contracting_dims, 2); const int64 output_rhs_non_contracting_partitions = get_partitions_for_dims( output_sharding, dims_mapping.rhs_non_contracting_dims, 2); + // Before we find partial matches along the dimensions, invoke base case again + // without may_reshard_without_detecting_match. TF_ASSIGN_OR_RETURN( auto try_partitioned_dot, PartitionBaseCase( @@ -1151,7 +1179,8 @@ StatusOr PartitionDot( lhs_non_contracting_partitions, rhs_non_contracting_partitions, output_lhs_non_contracting_partitions, output_rhs_non_contracting_partitions, - threshold_for_windowed_einsum_mib, b, windowed_dot_general_loops)); + threshold_for_windowed_einsum_mib, b, windowed_dot_general_loops, + /*may_reshard_without_detecting_match=*/false)); if (try_partitioned_dot) { return try_partitioned_dot; } @@ -1202,8 +1231,8 @@ StatusOr PartitionDot( : rhs_contracting_partitions, lhs_matching ? rhs_contracting_partitions : lhs_contracting_partitions, - lhs_matching ? lhs_non_contracting_partitions - : rhs_non_contracting_partitions, + lhs_matching ? dims_mapping.lhs_non_contracting_dims + : dims_mapping.rhs_non_contracting_dims, lhs_matching ? rhs_non_contracting_partitions : lhs_non_contracting_partitions, lhs_matching ? 
output_rhs_non_contracting_partitions @@ -1216,6 +1245,62 @@ StatusOr PartitionDot( return dot; } } + if (lhs_non_contracting_partitions > 1 && + output_lhs_non_contracting_partitions > 1) { + // If part of LHS non-contracting dims match output, try them. + std::vector matching_dims; + for (const auto& dim : dims_mapping.lhs_non_contracting_dims) { + int64 lhs_partitions = lhs.sharding().tile_assignment().dim(dim.lhs); + if (lhs_partitions > 1 && + lhs_partitions == output_sharding.tile_assignment().dim(dim.output)) { + matching_dims.push_back(dim); + } + } + if (!matching_dims.empty()) { + TF_ASSIGN_OR_RETURN( + auto dot, + PartitionDotGroupOnNonContracting( + /*lhs_matching=*/true, lhs, rhs, lhs_contracting_partitions, + rhs_contracting_partitions, matching_dims, + rhs_non_contracting_partitions, + output_rhs_non_contracting_partitions, output_base_shape, + output_sharding, dims_mapping, num_partitions, create_sharded_dot, + module, original_hlo, require_matching_devices_to_group, + threshold_for_windowed_einsum_mib, b, + windowed_dot_general_loops)); + if (dot) { + return dot; + } + } + } + if (rhs_non_contracting_partitions > 1 && + output_rhs_non_contracting_partitions > 1) { + // If part of RHS non-contracting dims match output, try them. + std::vector matching_dims; + for (const auto& dim : dims_mapping.rhs_non_contracting_dims) { + int64 rhs_partitions = rhs.sharding().tile_assignment().dim(dim.rhs); + if (rhs_partitions > 1 && + rhs_partitions == output_sharding.tile_assignment().dim(dim.output)) { + matching_dims.push_back(dim); + } + } + if (!matching_dims.empty()) { + TF_ASSIGN_OR_RETURN( + auto dot, + PartitionDotGroupOnNonContracting( + /*lhs_matching=*/false, rhs, lhs, rhs_contracting_partitions, + lhs_contracting_partitions, matching_dims, + lhs_non_contracting_partitions, + output_lhs_non_contracting_partitions, output_base_shape, + output_sharding, dims_mapping, num_partitions, create_sharded_dot, + module, original_hlo, require_matching_devices_to_group, + threshold_for_windowed_einsum_mib, b, + windowed_dot_general_loops)); + if (dot) { + return dot; + } + } + } // Case 3: Group partitions by contracting dimensions. if (lhs_contracting_partitions == rhs_contracting_partitions && @@ -1223,7 +1308,7 @@ StatusOr PartitionDot( TF_ASSIGN_OR_RETURN( auto dot, PartitionDotGroupOnContracting( - lhs, rhs, lhs_contracting_partitions, output_batch_partitions, + lhs, rhs, dims_mapping.contracting_dims, output_batch_partitions, output_lhs_non_contracting_partitions, output_rhs_non_contracting_partitions, output_base_shape, output_sharding, dims_mapping, num_partitions, create_sharded_dot, @@ -1233,6 +1318,71 @@ StatusOr PartitionDot( return dot; } } + if (lhs_contracting_partitions > 1 && rhs_contracting_partitions > 1) { + // If part of contracting dims match, try them. 
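Restating the selection rule used by the new partial-match cases above and below as a standalone sketch: any subset of dimension pairs whose partition counts agree between an operand's sharding and the output's (or the other operand's) sharding can serve as the grouping dimensions, and group_count is the product of their partition counts. The small structs here stand in for the dims-mapping and tile-assignment types; this is an illustration, not the XLA API.

#include <cstdint>
#include <vector>

struct DimPair {
  int64_t lhs;
  int64_t output;
};

struct Match {
  std::vector<DimPair> dims;
  int64_t group_count = 1;
};

Match FindMatchingNonContractingDims(const std::vector<int64_t>& lhs_tiles,
                                     const std::vector<int64_t>& out_tiles,
                                     const std::vector<DimPair>& mapping) {
  Match m;
  for (const DimPair& d : mapping) {
    // Keep a dim only if it is partitioned, and partitioned identically in
    // the operand and in the output.
    if (lhs_tiles[d.lhs] > 1 && lhs_tiles[d.lhs] == out_tiles[d.output]) {
      m.dims.push_back(d);
      m.group_count *= lhs_tiles[d.lhs];
    }
  }
  return m;
}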
+ std::vector matching_dims; + for (const auto& dim : dims_mapping.contracting_dims) { + int64 lhs_partitions = lhs.sharding().tile_assignment().dim(dim.lhs); + if (lhs_partitions > 1 && + lhs_partitions == rhs.sharding().tile_assignment().dim(dim.rhs)) { + matching_dims.push_back(dim); + } + } + if (!matching_dims.empty()) { + TF_ASSIGN_OR_RETURN( + auto dot, + PartitionDotGroupOnContracting( + lhs, rhs, matching_dims, output_batch_partitions, + output_lhs_non_contracting_partitions, + output_rhs_non_contracting_partitions, output_base_shape, + output_sharding, dims_mapping, num_partitions, create_sharded_dot, + module, original_hlo, require_matching_devices_to_group, + threshold_for_windowed_einsum_mib, b, + windowed_dot_general_loops)); + if (dot) { + return dot; + } + } + } + + // Case 4: If operands are replicated but output is partially replicated, + // recursive call with partial replication removed. + if (lhs.sharding().IsReplicated() && rhs.sharding().IsReplicated() && + output_sharding.ReplicateOnLastTileDim()) { + auto grouped_output = + GroupShardingOnDims(output_sharding, {output_base_shape.rank()}); + auto inner_state = CreatePerGroupPartitioningState( + lhs.state(), grouped_output.device_groups, b); + TF_ASSIGN_OR_RETURN( + auto dot, + PartitionDot(PartitionedHlo(lhs.hlo(), lhs.base_shape(), inner_state), + PartitionedHlo(rhs.hlo(), rhs.base_shape(), inner_state), + output_base_shape, grouped_output.sharding, dims_mapping, + output_sharding.NumTiles(), create_sharded_dot, module, + original_hlo, threshold_for_windowed_einsum_mib, b, + windowed_dot_general_loops)); + if (dot) { + return dot; + } + } + + // We failed to find partial matches, invoke base case again with + // may_reshard_without_detecting_match. + TF_ASSIGN_OR_RETURN( + auto dot, + PartitionBaseCase( + lhs, rhs, output_base_shape, output_sharding, dims_mapping, + num_partitions, create_sharded_dot, module, original_hlo, + lhs_batch_partitions, rhs_batch_partitions, output_batch_partitions, + lhs_contracting_partitions, rhs_contracting_partitions, + lhs_non_contracting_partitions, rhs_non_contracting_partitions, + output_lhs_non_contracting_partitions, + output_rhs_non_contracting_partitions, + threshold_for_windowed_einsum_mib, b, windowed_dot_general_loops, + /*may_reshard_without_detecting_match=*/true)); + if (dot) { + return dot; + } return nullptr; } diff --git a/tensorflow/compiler/xla/service/spmd/schedule_aware_all_gather_cse.cc b/tensorflow/compiler/xla/service/spmd/schedule_aware_all_gather_cse.cc new file mode 100644 index 00000000000..cc97d5ebda7 --- /dev/null +++ b/tensorflow/compiler/xla/service/spmd/schedule_aware_all_gather_cse.cc @@ -0,0 +1,132 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/spmd/schedule_aware_all_gather_cse.h" + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace xla { +namespace { + +HloCollectiveInstruction* MayConsiderAsAllGather(HloInstruction* hlo, + bool for_replicas) { + auto coll = DynCast(hlo); + if (!coll) { + return nullptr; + } + if (coll->constrain_layout()) { + return nullptr; + } + if (for_replicas == coll->channel_id().has_value()) { + return nullptr; + } + if (coll->opcode() == HloOpcode::kAllGather) { + return coll; + } + // Consider broadcast -> dynamic-update-slice -> all-reduce as all-gather. + if (coll->opcode() == HloOpcode::kAllReduce && coll->shape().IsArray()) { + auto operand = coll->operand(0); + return operand->opcode() == HloOpcode::kDynamicUpdateSlice && + operand->operand(0)->opcode() == HloOpcode::kBroadcast + ? coll + : nullptr; + } + return nullptr; +} + +StatusOr RunOnComputation(HloComputation* comp, bool for_replicas, + int64 distance_threshold) { + // We consider estimate the live ranges of all-gathers by comparing their + // users' distance to the root, e.g., height. + absl::flat_hash_map height; + auto ordered_hlos = comp->MakeInstructionPostOrder(); + int64 max_height = 0; + for (auto it = ordered_hlos.rbegin(); it != ordered_hlos.rend(); ++it) { + auto hlo = *it; + int64 h = 0; + for (auto user : hlo->users()) { + h = std::max(h, height[user]) + 1; + } + max_height = std::max(max_height, h); + height[hlo] = h; + } + + auto lowest_user_height = [&](const HloInstruction* hlo) { + int64 lowest = height[hlo]; + for (auto user : hlo->users()) { + lowest = std::min(lowest, height[user]); + } + return lowest; + }; + + absl::flat_hash_map> + operand_to_ag; + bool changed = false; + for (auto hlo : ordered_hlos) { + auto ag = MayConsiderAsAllGather(hlo, for_replicas); + if (!ag) { + continue; + } + + auto& earlier_ags = operand_to_ag[ag->operand(0)]; + bool found = false; + int64 lowest_user_h = lowest_user_height(ag); + for (auto& eag : earlier_ags) { + auto old_channel_id = ag->channel_id(); + if (eag->channel_id() && ag->channel_id()) { + ag->set_channel_id(eag->channel_id()); + } + if (!eag->Identical(*ag)) { + ag->set_channel_id(old_channel_id); + continue; + } + found = true; + ag->set_channel_id(old_channel_id); + if (lowest_user_height(eag) > lowest_user_h + distance_threshold) { + eag = ag; + continue; + } + changed = true; + VLOG(1) << "Replacing " << ag->ToString() << " with " << eag->ToString(); + TF_RETURN_IF_ERROR(ag->ReplaceAllUsesWith(eag)); + break; + } + if (!found) { + earlier_ags.push_back(ag); + } + } + return changed; +} + +} // namespace + +StatusOr ScheduleAwareAllGatherCSE::Run(HloModule* module) { + bool changed = false; + for (auto comp : module->computations()) { + TF_ASSIGN_OR_RETURN( + auto comp_changed, + RunOnComputation(comp, for_replicas_, distance_threshold_)); + changed |= comp_changed; + } + return changed; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/spmd/schedule_aware_all_gather_cse.h b/tensorflow/compiler/xla/service/spmd/schedule_aware_all_gather_cse.h new file mode 100644 index 00000000000..4653286ae97 --- /dev/null +++ 
b/tensorflow/compiler/xla/service/spmd/schedule_aware_all_gather_cse.h @@ -0,0 +1,49 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_SCHEDULE_AWARE_ALL_GATHER_CSE_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_SCHEDULE_AWARE_ALL_GATHER_CSE_H_ + +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" + +namespace xla { + +// Performs CSE for all-gather if their users are within reasonable live range. +class ScheduleAwareAllGatherCSE : public HloModulePass { + public: + // distance_threshold: maximum live range (in number of HLO instructions on + // the path) to consider CSE. + // for_replicas: specifies if this pass is for cross-replica or + // cross-partition all-gathers. + explicit ScheduleAwareAllGatherCSE(int64 distance_threshold, + bool for_replicas) + : distance_threshold_(distance_threshold), for_replicas_(for_replicas) {} + + ~ScheduleAwareAllGatherCSE() override = default; + absl::string_view name() const override { + return "schedule-aware-all-gather-cse"; + } + + StatusOr Run(HloModule* module) override; + + private: + int64 distance_threshold_; + bool for_replicas_; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_SCHEDULE_AWARE_ALL_GATHER_CSE_H_ diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc index a850c05600e..f16b7bacda3 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc @@ -221,15 +221,23 @@ HloInstruction* SpmdBuilder::AddInstruction( PartitionedHlo PartitionedHlo::Reshard(const HloSharding& target) { auto& cache = state_.reshard_cache->per_hlo_cache[hlo()].reshard_cache; - for (auto& entry : cache) { - if (entry.first == target) { - return entry.second; + const bool is_to_replicate = + hlo_->shape().IsArray() && target.NumTiles() < sharding().NumTiles(); + if (!is_to_replicate || state_.partitioner->options().cache_all_gather) { + for (auto& entry : cache) { + if (entry.first == target) { + return entry.second; + } } } - cache.emplace_back(target, ReshardNoCache(target)); - state_.reshard_cache->per_hlo_cache[cache.back().second.hlo()] + auto resharded = ReshardNoCache(target); + state_.reshard_cache->per_hlo_cache[resharded.hlo()] .reshard_cache.emplace_back(sharding(), *this); - return cache.back().second; + if (!is_to_replicate || state_.partitioner->options().cache_all_gather) { + cache.emplace_back(target, std::move(resharded)); + return cache.back().second; + } + return resharded; } PartitionedHlo PartitionedHlo::ReshardNoCache(const HloSharding& target) { @@ -282,133 +290,17 @@ PartitionedHlo PartitionedHlo::ReshardNoCache(const HloSharding& target) { return ReshardWithAllToAll(target, *src_tgt_dims); } - // Partial replicated to tiled. 
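For the ScheduleAwareAllGatherCSE pass added above, a standalone sketch of the "height" bookkeeping it relies on: each instruction gets a distance-to-root estimate, and two equivalent all-gathers are only merged when the survivor's nearest user is within distance_threshold heights of the duplicate's nearest user, so CSE does not stretch live ranges arbitrarily. Plain STL types stand in for HloInstruction; this is an illustration, not the pass's exact implementation.

#include <algorithm>
#include <cstdint>
#include <unordered_map>
#include <vector>

struct Op {
  std::vector<Op*> users;
};

// Assumes `post_order` lists operands before their users (a post-order walk),
// so the root is last and a reverse walk sees users before the instruction.
std::unordered_map<const Op*, int64_t> ComputeHeights(
    const std::vector<Op*>& post_order) {
  std::unordered_map<const Op*, int64_t> height;
  for (auto it = post_order.rbegin(); it != post_order.rend(); ++it) {
    int64_t h = 0;  // root has height 0
    for (const Op* user : (*it)->users) {
      h = std::max(h, height[user] + 1);
    }
    height[*it] = h;
  }
  return height;
}

// Two equivalent all-gathers may be CSE'd only if the one kept does not have
// to stay live much longer than the duplicate it replaces.
bool WithinDistance(int64_t kept_lowest_user_height,
                    int64_t duplicate_lowest_user_height,
                    int64_t distance_threshold) {
  return kept_lowest_user_height <=
         duplicate_lowest_user_height + distance_threshold;
}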
- if (sharding().ReplicateOnLastTileDim() && !target.ReplicateOnLastTileDim() && - !target.IsTileMaximal()) { - // Get the temp sharding target from partial replicate to target tile dims. - // target_compatible_sharding has the same tile_assignment dimensions - // as the target and can reshard to target by collective permute. - // target_compatible_sharding could have different device assignment as - // targe. sharding() can reshard to target_compatible_sharding by - // dynamic slice. - auto target_compatible_sharding = PartialReplicateToTileCompatibleSharding( - sharding(), target.tile_assignment().dimensions()); - // Reshard to target_compatible_sharding by dynamic slice. - if (target_compatible_sharding.has_value()) { - std::vector expand_tile_dims; - std::vector tiling_dim_factors; - int64 rank = shape.rank(); - tiling_dim_factors.reserve(rank); - auto temp_target_sharding = target_compatible_sharding.value(); - for (int64 dim = 0; dim < rank; dim++) { - if (temp_target_sharding.tile_assignment().dim(dim) > - sharding().tile_assignment().dim(dim)) { - expand_tile_dims.push_back(dim); - } - tiling_dim_factors.emplace_back( - temp_target_sharding.tile_assignment().dim(dim) / - sharding().tile_assignment().dim(dim)); - } - - // Get per_group partitioner state. - std::vector group_dims( - sharding().tile_assignment().num_dimensions() - 1); - std::iota(group_dims.begin(), group_dims.end(), 0); - auto sharding_grouped = GroupShardingOnDims(sharding(), group_dims); - auto per_group_partitioner_state = CreatePerGroupPartitioningState( - state_, sharding_grouped.device_groups, state_.b); - // 2. Get the padded_hlo, do right halo exchange if needed. - auto padded_hlo = PadFromPartialReplicateShape( - hlo_, base_shape_, sharding(), temp_target_sharding, expand_tile_dims, - state_.collective_ops_creator, state_.next_channel_id, - state_.partition_id, state_.b); - if (padded_hlo.has_value()) { - // 3. Slice out the tile from replicate ones. - auto shard_shape = - MakePartitionedShape(base_shape_, temp_target_sharding); - // device assignment within each group is sorted in - // HloSharding::PartialTile, thus partiton_id within each group can be - // matched with the order in tile_assignment. - Array tiling_assignment(tiling_dim_factors); - tiling_assignment.FillIota(0); - auto slice = - state_.b->AddInstruction(HloInstruction::CreateDynamicSlice( - shard_shape, padded_hlo.value(), - MakePartitionOffsets(padded_hlo.value()->shape(), - HloSharding::Tile(tiling_assignment), - per_group_partitioner_state.partition_id, - per_group_partitioner_state.b), - shard_shape.dimensions())); - slice->set_sharding(temp_target_sharding); - auto result = PartitionedHlo(slice, base_shape_, state_); - // If temp_target_sharding's device assignment is different from target, - // use collective permute to reshard. - if (CanReshardWithCollectivePermute(temp_target_sharding, target)) { - return result.ReshardWithCollectivePermute(target); - } - // If device assignment in temp_target_sharding and target are the same, - // return result directly. - return result; - } + if (!target.IsTileMaximal() && sharding().ReplicateOnLastTileDim()) { + auto try_reshard = ReshardFromPartialReplicateWithDynamicSlice(target); + if (try_reshard.has_value()) { + return try_reshard.value(); } } - // Tiled to partial replicate - if (!sharding().ReplicateOnLastTileDim() && !sharding().IsTileMaximal() && - target.ReplicateOnLastTileDim()) { - // Get the comptible sharding to target with resharding by all reduce. 
- auto compatible_sharding = PartialReplicateToTileCompatibleSharding( - target, sharding().tile_assignment().dimensions()); - if (compatible_sharding.has_value()) { - auto temp_sharding = compatible_sharding.value(); - auto partitioned_hlo = *this; - // Use collective permute to adjust device assignment if needed. - if (CanReshardWithCollectivePermute(sharding(), temp_sharding)) { - partitioned_hlo = - partitioned_hlo.ReshardWithCollectivePermute(temp_sharding); - } - - // Get replicate dims and replicate factor of each dimensions. - int64 rank = hlo_->shape().rank(); - std::vector replicate_dims; - std::vector replicate_factors; - for (int64 dim = 0; dim < rank; dim++) { - int64 replicate_factor = temp_sharding.tile_assignment().dim(dim) / - target.tile_assignment().dim(dim); - if (replicate_factor > 1) { - replicate_dims.emplace_back(dim); - replicate_factors.emplace_back(replicate_factor); - } - } - - // Do left halo exchange if all-reduce directly will remove useful data - // from the source. - auto halo_exchange = TileToPartialReplicateHaloExchange( - partitioned_hlo.hlo_, base_shape_, temp_sharding, target, - replicate_dims, partitioned_hlo.state().collective_ops_creator, - partitioned_hlo.state().next_channel_id, - partitioned_hlo.state().partition_id, partitioned_hlo.state().b); - if (halo_exchange.has_value()) { - auto halo_exchange_hlo = halo_exchange.value(); - // Grouped on replicate dimensions. - auto sharding_grouped = GroupShardingOnDims( - temp_sharding, replicate_dims, replicate_factors); - auto per_group_partitioner_state = CreatePerGroupPartitioningState( - partitioned_hlo.state(), sharding_grouped.device_groups, - partitioned_hlo.state().b); - auto base_shape = MakePartitionedShape(base_shape_, target); - // It's possible that halo_exchange_hlo == hlo.hlo(). - // Record the sharding of hlo here, and reset it before return. 
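A hedged, standalone sketch of the per-dimension bookkeeping behind the two reshard helpers introduced in this change (ReshardToPartialReplicateWithAllGather and ReshardFromPartialReplicateWithDynamicSlice): once a compatible intermediate sharding is found, a dimension that becomes more tiled by an integer factor is handled with a dynamic-slice, and one that becomes less tiled by an integer factor is handled with all-gather style partial replication. The real helpers each handle one direction at a time; vectors of per-dimension tile counts stand in for HloSharding, so this is an illustration, not the XLA API.

#include <cstdint>
#include <optional>
#include <vector>

struct PerDimPlan {
  std::vector<int64_t> slice_factors;      // >1 where the target is more tiled
  std::vector<int64_t> replicate_factors;  // >1 where the target is less tiled
};

std::optional<PerDimPlan> PlanReshard(const std::vector<int64_t>& src_tiles,
                                      const std::vector<int64_t>& tgt_tiles) {
  PerDimPlan plan;
  for (size_t i = 0; i < src_tiles.size(); ++i) {
    const int64_t src = src_tiles[i];
    const int64_t tgt = tgt_tiles[i];
    if (tgt >= src && tgt % src == 0) {
      plan.slice_factors.push_back(tgt / src);      // dynamic-slice path
      plan.replicate_factors.push_back(1);
    } else if (src % tgt == 0) {
      plan.slice_factors.push_back(1);
      plan.replicate_factors.push_back(src / tgt);  // all-gather path
    } else {
      return std::nullopt;  // incompatible split; a different reshard is needed
    }
  }
  return plan;
}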
- auto original_sharding = partitioned_hlo.sharding(); - halo_exchange_hlo->set_sharding(sharding_grouped.sharding); - auto partial_replicate_hlo = PartitionedHlo( - halo_exchange_hlo, base_shape, per_group_partitioner_state); - HloInstruction* result = - partial_replicate_hlo.ReplicatePartial(replicate_dims); - partitioned_hlo.hlo()->set_sharding(original_sharding); - result->set_sharding(target); - return PartitionedHlo(result, base_shape_, partitioned_hlo.state()); - } + if (!sharding().IsTileMaximal() && target.ReplicateOnLastTileDim()) { + auto try_reshard = ReshardToPartialReplicateWithAllGather(target); + if (try_reshard.has_value()) { + return try_reshard.value(); } } @@ -794,6 +686,14 @@ PartitionedHlo::ReshardAsWindowedInput(const Window& window, } PartitionedHlo PartitionedHlo::Replicate() { + auto& cache = state_.reshard_cache->per_hlo_cache[hlo()].reshard_cache; + if (state_.partitioner->options().cache_all_gather) { + for (auto& entry : cache) { + if (entry.first.IsReplicated()) { + return entry.second; + } + } + } const HloSharding& sharding = hlo_->sharding(); const Shape& shape = hlo_->shape(); CHECK(!shape.IsTuple() && shape.element_type() != TOKEN); @@ -801,7 +701,6 @@ PartitionedHlo PartitionedHlo::Replicate() { if (sharding.IsReplicated()) { return *this; } - auto& cache = state_.reshard_cache->per_hlo_cache[hlo()].reshard_cache; for (auto& entry : cache) { if (entry.first.IsReplicated()) { return entry.second; @@ -810,8 +709,11 @@ PartitionedHlo PartitionedHlo::Replicate() { auto update_cache = [&](PartitionedHlo resharded) { state_.reshard_cache->per_hlo_cache[resharded.hlo()] .reshard_cache.emplace_back(sharding, *this); - cache.emplace_back(HloSharding::Replicate(), std::move(resharded)); - return cache.back().second; + if (state_.partitioner->options().cache_all_gather) { + cache.emplace_back(HloSharding::Replicate(), std::move(resharded)); + return cache.back().second; + } + return resharded; }; // 'Single Device' to 'Repliated'. if (sharding.IsTileMaximal()) { @@ -872,6 +774,155 @@ HloInstruction* PartitionedHlo::ReplicatePartial(absl::Span dims) { return result; } +absl::optional +PartitionedHlo::ReshardToPartialReplicateWithAllGather( + const HloSharding& target) { + if (!target.ReplicateOnLastTileDim()) { + return absl::nullopt; + } + // Tiled/partial replicate to partial replicate + // Get the comptible sharding to target with resharding by all reduce. + auto compatible_sharding = + PartialReplicateReshardCompatibleSharding(target, sharding()); + if (!compatible_sharding.has_value()) { + return absl::nullopt; + } + + const auto& temp_sharding = compatible_sharding.value(); + auto partitioned_hlo = *this; + // Use collective permute to adjust device assignment if needed. + if (CanReshardWithCollectivePermute(sharding(), temp_sharding)) { + partitioned_hlo = + partitioned_hlo.ReshardWithCollectivePermute(temp_sharding); + } + + // Get replicate dims and replicate factor of each dimensions. + int64 rank = hlo_->shape().rank(); + std::vector replicate_dims; + std::vector replicate_factors; + for (int64 dim = 0; dim < rank; dim++) { + int64 replicate_factor = temp_sharding.tile_assignment().dim(dim) / + target.tile_assignment().dim(dim); + if (replicate_factor > 1) { + replicate_dims.emplace_back(dim); + replicate_factors.emplace_back(replicate_factor); + } + } + + // Do left halo exchange if all-reduce directly will remove useful data + // from the source. 
+ auto halo_exchange = TileToPartialReplicateHaloExchange( + partitioned_hlo.hlo_, base_shape_, temp_sharding, target, replicate_dims, + partitioned_hlo.state().collective_ops_creator, + partitioned_hlo.state().next_channel_id, + partitioned_hlo.state().partition_id, partitioned_hlo.state().b); + if (!halo_exchange.has_value()) { + return absl::nullopt; + } + auto halo_exchange_hlo = halo_exchange.value(); + // Grouped on replicate dimensions. + auto sharding_grouped = + GroupShardingOnDims(temp_sharding, replicate_dims, replicate_factors); + auto per_group_partitioner_state = CreatePerGroupPartitioningState( + partitioned_hlo.state(), sharding_grouped.device_groups, + partitioned_hlo.state().b); + auto base_shape = MakePartitionedShape(base_shape_, target); + // It's possible that halo_exchange_hlo == hlo.hlo(). + // Record the sharding of hlo here, and reset it before return. + auto original_sharding = partitioned_hlo.sharding(); + halo_exchange_hlo->set_sharding(sharding_grouped.sharding); + auto partial_replicate_hlo = PartitionedHlo(halo_exchange_hlo, base_shape, + per_group_partitioner_state); + HloInstruction* result = + partial_replicate_hlo.ReplicatePartial(replicate_dims); + partitioned_hlo.hlo()->set_sharding(original_sharding); + result->set_sharding(target); + return PartitionedHlo(result, base_shape_, partitioned_hlo.state()); +} + +absl::optional +PartitionedHlo::ReshardFromPartialReplicateWithDynamicSlice( + const HloSharding& target) { + if (!sharding().ReplicateOnLastTileDim()) { + return absl::nullopt; + } + + // Get the temp sharding target from partial replicate to target tile dims. + // target_compatible_sharding has the same tile_assignment dimensions + // as the target and can reshard to target by collective permute. + // target_compatible_sharding could have different device assignment as + // targe. sharding() can reshard to target_compatible_sharding by + // dynamic slice. + auto target_compatible_sharding = + PartialReplicateReshardCompatibleSharding(sharding(), target); + // Reshard to target_compatible_sharding by dynamic slice. + if (!target_compatible_sharding.has_value()) { + return absl::nullopt; + } + std::vector expand_tile_dims; + std::vector tiling_dim_factors; + int64 rank = hlo_->shape().rank(); + tiling_dim_factors.reserve(target.tile_assignment().num_dimensions()); + const auto& temp_target_sharding = target_compatible_sharding.value(); + for (int64 dim = 0; dim < rank; dim++) { + if (temp_target_sharding.tile_assignment().dim(dim) > + sharding().tile_assignment().dim(dim)) { + expand_tile_dims.push_back(dim); + } + tiling_dim_factors.emplace_back( + temp_target_sharding.tile_assignment().dim(dim) / + sharding().tile_assignment().dim(dim)); + } + + // Add another dimension in tiling_dim_factors if target is partial replicate. + if (target.ReplicateOnLastTileDim()) { + tiling_dim_factors.emplace_back( + target.tile_assignment().dimensions().back()); + } + + // Get per_group partitioner state. + std::vector group_dims(sharding().tile_assignment().num_dimensions() - + 1); + std::iota(group_dims.begin(), group_dims.end(), 0); + auto sharding_grouped = GroupShardingOnDims(sharding(), group_dims); + auto per_group_partitioner_state = CreatePerGroupPartitioningState( + state_, sharding_grouped.device_groups, state_.b); + // 2. Get the padded_hlo, do right halo exchange if needed. 
+ auto padded_hlo = PadFromPartialReplicateShape( + hlo_, base_shape_, sharding(), temp_target_sharding, expand_tile_dims, + state_.collective_ops_creator, state_.next_channel_id, + state_.partition_id, state_.b); + if (!padded_hlo.has_value()) { + return absl::nullopt; + } + // 3. Slice out the tile from replicate ones. + auto shard_shape = MakePartitionedShape(base_shape_, temp_target_sharding); + // device assignment within each group is sorted in + // HloSharding::PartialTile, thus partiton_id within each group can be + // matched with the order in tile_assignment. + Array tiling_assignment(tiling_dim_factors); + tiling_assignment.FillIota(0); + auto slice = state_.b->AddInstruction(HloInstruction::CreateDynamicSlice( + shard_shape, padded_hlo.value(), + MakePartitionOffsets(padded_hlo.value()->shape(), + target.ReplicateOnLastTileDim() + ? HloSharding::PartialTile(tiling_assignment) + : HloSharding::Tile(tiling_assignment), + per_group_partitioner_state.partition_id, + per_group_partitioner_state.b), + shard_shape.dimensions())); + slice->set_sharding(temp_target_sharding); + auto result = PartitionedHlo(slice, base_shape_, state_); + // If temp_target_sharding's device assignment is different from target, + // use collective permute to reshard. + if (CanReshardWithCollectivePermute(temp_target_sharding, target)) { + return result.ReshardWithCollectivePermute(target); + } + // If device assignment in temp_target_sharding and target are the same, + // return result directly. + return result; +} + PartitionedHlo PartitionedHlo::Broadcast() const { const Shape& shape = hlo_->shape(); const HloSharding& sharding = hlo_->sharding(); @@ -1048,6 +1099,25 @@ PartitionedHlo PartitionedHlo::ReshardWithCollectivePermute( const HloSharding& target) const { CHECK(CanReshardWithCollectivePermute(sharding(), target)) << sharding().ToString() << " to " << target.ToString(); + if (hlo()->opcode() == HloOpcode::kBroadcast) { + // If hlo() is a broadcast, check if data is already the same between + // source/destination pairs. + std::vector new_dims; + for (int64 i = 0; i < hlo()->shape().rank(); ++i) { + if (!absl::c_linear_search(hlo()->dimensions(), i)) { + new_dims.push_back(i); + } + } + if (hlo_sharding_util::PartiallyReplicateTiledShardingOnDims(sharding(), + new_dims) == + hlo_sharding_util::PartiallyReplicateTiledShardingOnDims(target, + new_dims)) { + auto copy = state_.b->AddInstruction( + HloInstruction::CreateUnary(hlo()->shape(), HloOpcode::kCopy, hlo())); + copy->set_sharding(target); + return PartitionedHlo(copy, base_shape_, state_); + } + } std::vector> src_dst_pairs; sharding().tile_assignment().Each( [&](absl::Span indices, int64 src_device) { @@ -1868,6 +1938,16 @@ Status SpmdPartitioningVisitor::HandleReshape(HloInstruction* hlo) { return Status::OK(); } + // Check if operand sharding and sharding are both tiled or partial replicate. + // If both of them are partial replicate, check num_replications are the same. + if (operand.sharding().ReplicateOnLastTileDim() != + sharding.ReplicateOnLastTileDim() || + (sharding.ReplicateOnLastTileDim() && + (operand.sharding().tile_assignment().dimensions().back() != + sharding.tile_assignment().dimensions().back()))) { + return DefaultAction(hlo); + } + // Try use halo exchange for certain split-dim/merge-dims cases. // ReshapeSharding failed in these cases probably due to uneven partitioning, // where halo exchange could help. 
Specifically we check the following @@ -1903,7 +1983,14 @@ Status SpmdPartitioningVisitor::HandleReshape(HloInstruction* hlo) { Array new_input_tile_assignment = sharding.tile_assignment(); new_input_tile_assignment.Reshape( operand.sharding().tile_assignment().dimensions()); - operand = operand.Reshard(HloSharding::Tile(new_input_tile_assignment)); + auto aligned_sharding = + sharding.ReplicateOnLastTileDim() + ? HloSharding::PartialTile(new_input_tile_assignment) + : HloSharding::Tile(new_input_tile_assignment); + operand = operand.Reshard(aligned_sharding); + auto replication_count = sharding.ReplicateOnLastTileDim() + ? sharding.tile_assignment().dimensions().back() + : 1; int64 input_dim_size = operand.base_shape().dimensions(input_sharded_dim); int64 output_dim_size = hlo->shape().dimensions(output_sharded_dim); @@ -1926,7 +2013,7 @@ Status SpmdPartitioningVisitor::HandleReshape(HloInstruction* hlo) { dim->set_padding_low(0); if (i == input_sharded_dim) { dim->set_padding_high(output_shard_size * split_factor * - num_partitions_ - + num_partitions_ / replication_count - input_dim_size); } else { dim->set_padding_high(0); @@ -1964,8 +2051,8 @@ Status SpmdPartitioningVisitor::HandleReshape(HloInstruction* hlo) { tmp_reshape->set_sharding(hlo->sharding()); auto tmp_full_shape = tmp_shard_shape; tmp_full_shape.set_dimensions( - output_sharded_dim, - tmp_shard_shape.dimensions(output_sharded_dim) * num_partitions_); + output_sharded_dim, tmp_shard_shape.dimensions(output_sharded_dim) * + num_partitions_ / replication_count); auto tmp_output = PartitionedHlo(tmp_reshape, tmp_full_shape, MakePartitioningState()); @@ -1982,7 +2069,7 @@ Status SpmdPartitioningVisitor::HandleReshape(HloInstruction* hlo) { if (i == output_sharded_dim) { dim->set_padding_high(output_dim_size - tmp_shard_shape.dimensions(output_sharded_dim) * - num_partitions_); + num_partitions_ / replication_count); } else { dim->set_padding_high(0); } @@ -2605,7 +2692,13 @@ Status SpmdPartitioningVisitor::HandleReduce(HloInstruction* hlo) { .Reshard(HloSharding::Replicate()) .hlo()); inputs.push_back(GetPartitionedHlo(hlo->operand(operand_id))); - if (operand_id > 0) { + if (hlo->shape().IsTuple() && operand_id == 0) { + // We cannot do tuple-reduce where partitioned dimensions are reduced. + // Partially replicate on those dims. + inputs[0] = inputs[0].Reshard( + hlo_sharding_util::PartiallyReplicateTiledShardingOnDims( + inputs[0].sharding(), hlo->dimensions())); + } else { // Make sure all operands are sharded in the same way. inputs.back() = inputs.back().Reshard(inputs[0].sharding()); } @@ -2613,17 +2706,6 @@ Status SpmdPartitioningVisitor::HandleReduce(HloInstruction* hlo) { inputs.back() = inputs.back().PadWithValue(inits[operand_id]); } } - bool reduce_sharded_dimension = false; - if (!inputs[0].sharding().IsTileMaximal()) { - reduce_sharded_dimension = absl::c_any_of(hlo->dimensions(), [&](int64 i) { - return inputs[0].sharding().tile_assignment().dim(i) > 1; - }); - - // reduce_sharded_dimension is not supported for tuple-shaped reduces. 
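A small standalone sketch of the check that HandleReduce's updated logic hinges on: a cross-partition combine is needed only when some reduced dimension is actually split across partitions, and for tuple-shaped reduces this change sidesteps that case by partially replicating the reduced dimensions first. Plain integers stand in for HloSharding's tile assignment; this is an illustration, not the XLA API.

#include <cstdint>
#include <vector>

bool ReducesShardedDimension(const std::vector<int64_t>& tile_dims,
                             const std::vector<int64_t>& reduce_dims) {
  for (int64_t dim : reduce_dims) {
    if (tile_dims[dim] > 1) return true;  // this dimension is partitioned
  }
  return false;
}

// Example: tiles {2, 1, 2} with reduce_dims {1} -> false (a local reduce is
// enough); reduce_dims {0} -> true (needs a cross-partition combine, or, for
// tuple reduces, partial replication of the reduced dims before reducing).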
- if (reduce_sharded_dimension && input_count > 1) { - return DefaultAction(hlo); - } - } std::vector new_operand_shapes(input_count * 2); for (int64 i = 0; i < input_count; ++i) { @@ -2646,6 +2728,11 @@ Status SpmdPartitioningVisitor::HandleReduce(HloInstruction* hlo) { SetPartitionedHlo(hlo, [&]() { HloInstruction* reduce = local_reduce; + const bool reduce_sharded_dimension = + !inputs[0].sharding().IsTileMaximal() && + absl::c_any_of(hlo->dimensions(), [&](int64 i) { + return inputs[0].sharding().tile_assignment().dim(i) > 1; + }); if (reduce_sharded_dimension) { CHECK(local_reduce->shape().IsArray()); std::vector preserved_dims; @@ -3353,7 +3440,7 @@ StatusOr SpmdPartitioner::Run(HloModule* module) { HloPassPipeline pass("spmd-cleanup"); pass.AddPass(); pass.AddPass(); - pass.AddPass(/*is_layout_sensitive=*/true); + pass.AddPass(/*is_layout_sensitive=*/false); pass.AddPass(); TF_RETURN_IF_ERROR(pass.Run(module).status()); } diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h index a612c16bdae..6447d08be41 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h @@ -47,6 +47,12 @@ struct SpmdPartitionerOptions { // Whether the entry computations' signature could change after partitioning. bool allow_module_signature_change = false; + + // Whether to use cached all-gather to avoid repeatedly replicate a tiled + // tensor. If it is set to false, the result tends to be more + // memory-efficient, and the compiler can use the ScheduleAwareAllGatherCSE + // pass to CSE some all-gathers which are relatively close to each other. + bool cache_all_gather = true; }; // Class to wrap the computation builder to capture information during SPMD @@ -180,6 +186,8 @@ class SpmdPartitioner : public HloModulePass { int64 channel_id, absl::Span selected_dims, const SPMDCollectiveOpsCreator& collectives_creator); + const SpmdPartitionerOptions& options() { return options_; } + protected: virtual std::unique_ptr CreateVisitor( HloComputation* computation, int64 num_partitions, int64 num_replicas, @@ -305,6 +313,14 @@ class PartitionedHlo { // Helper function to reshard the tensor using CollectivePermute. PartitionedHlo ReshardWithCollectivePermute(const HloSharding& target) const; + // Helper function to reshard to partial replicate using AllGather. + absl::optional ReshardToPartialReplicateWithAllGather( + const HloSharding& target); + + // Helper function to reshard from partial replicate using DynamicSlice. + absl::optional ReshardFromPartialReplicateWithDynamicSlice( + const HloSharding& target); + // SPMD instruction. 
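The comment on the new cache_all_gather option describes the intended pairing with the CSE pass; the following is a hedged sketch of how that wiring might look, using the constructor added in this change and the standard HloPassPipeline::AddPass hook. The distance threshold and the choice to disable caching are illustrative assumptions, not recommended settings.

#include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h"
#include "tensorflow/compiler/xla/service/spmd/schedule_aware_all_gather_cse.h"
#include "tensorflow/compiler/xla/service/spmd/spmd_partitioner.h"

namespace xla {

SpmdPartitionerOptions MakeOptionsForScheduleAwareCse() {
  SpmdPartitionerOptions options;
  // Trade memory for duplicate all-gathers and rely on the CSE pass instead.
  options.cache_all_gather = false;
  return options;
}

void AddAllGatherCse(HloPassPipeline* pipeline) {
  // Cross-partition all-gathers carry channel ids, so for_replicas is false;
  // the threshold below is an arbitrary illustrative value.
  pipeline->AddPass<ScheduleAwareAllGatherCSE>(/*distance_threshold=*/100,
                                               /*for_replicas=*/false);
}

}  // namespace xla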
HloInstruction* hlo_; diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc index 1dc4c474c49..089c4c339a4 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc @@ -138,8 +138,7 @@ ENTRY entry { op::AllReduce(op::Select( op::Broadcast(op::Compare(op::PartitionId(), op::Constant())), op::Constant(), op::Broadcast())), - op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId(), - op::Constant())), + op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId())), op::Constant())), op::Shape("s32[1,3]"))); } @@ -161,8 +160,7 @@ ENTRY entry { op::Copy(op::AllReduce(AllOf( op::DynamicUpdateSlice( op::Broadcast(), AllOf(op::Constant(), op::Shape("s32[1,3]")), - op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId(), - op::Constant())), + op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId())), op::Constant()), op::Shape("s32[2,3]"))))); } @@ -184,8 +182,7 @@ ENTRY entry { op::Copy(op::Copy(op::AllReduce(AllOf( op::DynamicUpdateSlice( op::Broadcast(), AllOf(op::Constant(), op::Shape("s32[1,3]")), - op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId(), - op::Constant())), + op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId())), op::Constant()), op::Shape("s32[2,3]")))))); } @@ -279,8 +276,8 @@ ENTRY entry { HloInstruction* root = module->entry_computation()->root_instruction(); ASSERT_THAT(root, op::Tuple()); - auto offset = op::Reshape( - op::DynamicSlice(op::Constant(), op::PartitionId(), op::Constant())); + auto offset = + op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId())); EXPECT_THAT(root->operand(0), op::DynamicSlice(op::GetTupleElement(op::Parameter()), offset, @@ -305,13 +302,13 @@ ENTRY entry { PartitionComputation(hlo_string, /*num_devices=*/2)); HloInstruction* root = module->entry_computation()->root_instruction(); EXPECT_THAT( - root, op::Copy(op::AllReduce(op::DynamicUpdateSlice( - op::Broadcast(), - op::GetTupleElement( - AllOf(op::Infeed(), op::Shape("(f32[4,2]{1,0}, token[])"))), - op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId(), - op::Constant())), - op::Constant())))); + root, + op::Copy(op::AllReduce(op::DynamicUpdateSlice( + op::Broadcast(), + op::GetTupleElement( + AllOf(op::Infeed(), op::Shape("(f32[4,2]{1,0}, token[])"))), + op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId())), + op::Constant())))); } TEST_F(SpmdPartitioningTest, UnevenTiledInfeed) { @@ -2598,6 +2595,79 @@ ENTRY entry { EXPECT_THAT(root, AllOf(op::Transpose(), op::Shape("f32[16,2,38,38]"))); } +TEST_F(SpmdPartitioningTest, PartialReplicateShardableTranspose) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[16,38,38,4] parameter(0) + %param0.copy = f32[16,38,38,4] copy(%param0), + sharding={devices=[1,2,1,1,2]0,1,2,3 last_tile_dim_replicate} + ROOT %transpose = f32[16,4,38,38] transpose(%param0.copy), + dimensions={0,3,1,2}, + sharding={devices=[1,1,2,1,2]0,1,2,3 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[16,19,38,4]")); + EXPECT_THAT(root, AllOf(op::Transpose(param0), 
op::Shape("f32[16,4,19,38]"))); +} + +TEST_F(SpmdPartitioningTest, PartialReplicateNonShardableTranspose) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[16,38,38,4] parameter(0) + %param0.copy = f32[16,38,38,4] copy(%param0), + sharding={devices=[1,2,1,1,2]0,1,2,3 last_tile_dim_replicate} + ROOT %transpose = f32[16,4,38,38] transpose(%param0.copy), + dimensions={0,3,1,2}, + sharding={devices=[1,2,1,1,2]0,1,2,3 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto resahrd = AllOf(op::Reshape(op::Transpose(op::Reshape(op::AllToAll()))), + op::Shape("f32[16,38,38,2]")); + EXPECT_THAT(root, AllOf(op::Transpose(), op::Shape("f32[16,2,38,38]"))); +} + +TEST_F(SpmdPartitioningTest, PartialReplicateMultiDimensionShardedTranspose) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[16,38,38,4] parameter(0) + %param0.copy = f32[16,38,38,4] copy(%param0), + sharding={devices=[2,2,1,1,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + ROOT %transpose = f32[38,4,16,38] transpose(%param0.copy), + dimensions={1,3,0,2}, + sharding={devices=[2,1,2,1,2]0,1,4,5,2,3,6,7 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[8,19,38,4]")); + EXPECT_THAT(root, AllOf(op::Transpose(param0), op::Shape("f32[19,4,8,38]"))); +} + TEST_F(SpmdPartitioningTest, ShardableReshape) { const char* const hlo_string = R"( HloModule module @@ -2621,6 +2691,30 @@ ENTRY entry { EXPECT_THAT(root, AllOf(op::Reshape(param0), op::Shape("f32[19,38,4,81]"))); } +TEST_F(SpmdPartitioningTest, PartialReplicateShardableReshape) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[38,38,324] parameter(0) + %param0.copy = f32[38,38,324] copy(%param0), + sharding={devices=[2,1,1,2]0,1,2,3 last_tile_dim_replicate} + ROOT %reshape = f32[38,38,4,81] reshape(%param0.copy), + sharding={devices=[2,1,1,1,2]0,1,2,3 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = + AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[19,38,324]")); + EXPECT_THAT(root, AllOf(op::Reshape(param0), op::Shape("f32[19,38,4,81]"))); +} + TEST_F(SpmdPartitioningTest, NonShardableReshape) { const char* const hlo_string = R"( HloModule module @@ -2673,6 +2767,30 @@ ENTRY entry { EXPECT_THAT(root, AllOf(exchanged, op::Shape("s32[3,2,1,7,5]"))); } +TEST_F(SpmdPartitioningTest, PartialReplicateReshapeMergeDimsWithHaloExchange) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %input = s32[2,3,7,10] parameter(0), + sharding={devices=[1,1,2,1,2]0,1,2,3 last_tile_dim_replicate} + ROOT %reshape = s32[3,2,1,14,5] reshape(%input), + sharding={devices=[1,1,1,2,1,2]0,1,2,3 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) 
<< module->ToString(); + + auto reshape = + AllOf(op::Reshape(op::Parameter(0)), op::Shape("s32[3,2,1,8,5]")); + auto halo = op::CollectivePermute(op::Slice(reshape)); + auto exchanged = + op::DynamicSlice(op::Concatenate(halo, reshape), _, _, _, _, _); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(exchanged, op::Shape("s32[3,2,1,7,5]"))); +} + // Produces an invalid module after transformation. TEST_F(SpmdPartitioningTest, InceptionV3_4_way_ReduceWindowDilated) { const char* const hlo_string = R"( @@ -2831,6 +2949,48 @@ ENTRY %main { op::Shape("(f32[14], s32[14])"))); } +TEST_F(SpmdPartitioningTest, TiledToTiledTupleReduce2) { + const char* const hlo_string = R"( +HloModule module + +%minmax_func { + %lhs_value = f32[] parameter(0) + %rhs_value = f32[] parameter(2) + %compare.2 = pred[] compare(%lhs_value, %rhs_value), direction=GT + %select.4 = f32[] select(%compare.2, %lhs_value, %rhs_value) + %lhs_index = s32[] parameter(1) + %rhs_index = s32[] parameter(3) + %select.5 = s32[] select(%compare.2, %lhs_index, %rhs_index) + ROOT %tuple.2 = (f32[], s32[]) tuple(%select.4, %select.5) +} + +ENTRY %main { + %param0 = f32[28,10] parameter(0), sharding={devices=[2,2]0,1,2,3} + %param1 = s32[28,10] parameter(1), sharding={devices=[2,2]0,1,2,3} + %init0 = f32[] parameter(2) + %init1 = s32[] parameter(3) + ROOT %reduce = (f32[28], s32[28]) reduce(%param0, %param1, %init0, %init1), + dimensions={1}, to_apply=%minmax_func, + sharding={{devices=[2,2]0,1,2,3 last_tile_dim_replicate}, + {devices=[2,2]0,1,2,3 last_tile_dim_replicate}} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto lhs = + AllOf(op::Shape("f32[14,10]"), + op::AllReduce(op::DynamicUpdateSlice(_, op::Parameter(0), _, _))); + auto rhs = + AllOf(op::Shape("s32[14,10]"), + op::AllReduce(op::DynamicUpdateSlice(_, op::Parameter(1), _, _))); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, + AllOf(op::Reduce(lhs, rhs, op::Parameter(2), op::Parameter(3)), + op::Shape("(f32[14], s32[14])"))); +} + TEST_F(SpmdPartitioningTest, TiledToTiledReduceOutputReshard) { const char* const hlo_string = R"( HloModule module @@ -3793,8 +3953,8 @@ ENTRY entry { TF_ASSERT_OK_AND_ASSIGN(auto module, PartitionComputation(hlo_string, /*num_devices=*/2)); VLOG(1) << module->ToString(); - auto offset = op::Reshape( - op::DynamicSlice(op::Constant(), op::PartitionId(), op::Constant())); + auto offset = + op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId())); auto min = AllOf(op::Broadcast(offset), op::Shape("s32[2,3]")); auto max = AllOf(op::Broadcast(op::Add(offset, op::Constant())), op::Shape("s32[2,3]")); @@ -3930,8 +4090,8 @@ ENTRY entry { TF_ASSERT_OK_AND_ASSIGN(auto module, PartitionComputation(hlo_string, /*num_devices=*/2)); VLOG(1) << module->ToString(); - auto offset = op::Reshape( - op::DynamicSlice(op::Constant(), op::PartitionId(), op::Constant())); + auto offset = + op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId())); auto indices = op::Subtract( op::Parameter(1), AllOf(op::Broadcast(offset), op::Shape("s32[2,3]"))); HloInstruction* root = module->entry_computation()->root_instruction(); @@ -4119,7 +4279,7 @@ HloModule module ENTRY entry { %lhs = f32[48,12] parameter(0), sharding={devices=[2,2]0,1,2,3} - %rhs = f32[32,12] parameter(1), sharding={devices=[2,2]0,1,2,3} + %rhs = f32[32,12] parameter(1), sharding={devices=[2,2]0,2,1,3} ROOT %dot = f32[48,32] 
dot(%lhs, %rhs), lhs_batch_dims={}, rhs_batch_dims={}, lhs_contracting_dims={1}, rhs_contracting_dims={1}, @@ -4136,8 +4296,8 @@ ENTRY entry { op::AllReduce(op::DynamicUpdateSlice(_, lhs, _, _))); auto rhs = AllOf(op::Shape("f32[16,6]"), op::Parameter(1)); auto partial_replicated_rhs = - AllOf(op::Shape("f32[16,12]"), op::AllReduce(op::DynamicUpdateSlice( - _, op::CollectivePermute(rhs), _, _))); + AllOf(op::Shape("f32[16,12]"), + op::AllReduce(op::DynamicUpdateSlice(_, rhs, _, _))); auto root = module->entry_computation()->root_instruction(); EXPECT_THAT(root, AllOf(op::Dot(partial_replicated_lhs, partial_replicated_rhs), @@ -4429,6 +4589,33 @@ ENTRY entry { EXPECT_THAT(root, op::AllReduce(dot)); } +TEST_F(SpmdPartitioningTest, DotPartialContracting3) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[24,100] parameter(0), + sharding={devices=[1,2,4]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + %rhs = f32[32,100] parameter(1), + sharding={devices=[1,2,4]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + ROOT %dot = f32[24,32] dot(%lhs, %rhs), + lhs_batch_dims={}, rhs_batch_dims={}, + lhs_contracting_dims={1}, rhs_contracting_dims={1}, + sharding={devices=[1,2,4]0,1,2,3,4,5,6,7 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + VLOG(1) << module->ToString(); + + auto lhs = AllOf(op::Shape("f32[24,50]"), op::Parameter(0)); + auto rhs = + AllOf(op::Shape("f32[16,50]"), op::DynamicSlice(op::Parameter(1), _, _)); + auto dot = AllOf(op::Shape("f32[24,16]"), op::Dot(lhs, rhs)); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::CollectivePermute(op::AllReduce(dot))); +} + TEST_F(SpmdPartitioningTest, DotBatchAndPartialContracting) { const char* const hlo_string = R"( HloModule module @@ -4484,6 +4671,119 @@ ENTRY entry { EXPECT_THAT(root, dot); } +TEST_F(SpmdPartitioningTest, DotPartialNonContractingPartialMatch) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[24,8,100] parameter(0), sharding={devices=[2,2,1]0,1,2,3} + %rhs = f32[32,100] parameter(1), + sharding={devices=[2,1,2]0,2,1,3 last_tile_dim_replicate} + ROOT %dot = f32[24,8,32] dot(%lhs, %rhs), + lhs_batch_dims={}, rhs_batch_dims={}, + lhs_contracting_dims={2}, rhs_contracting_dims={1}, + sharding={devices=[2,1,2]0,1,2,3} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto lhs = AllOf(op::Shape("f32[12,4,100]"), op::Parameter(0)); + auto rhs = AllOf(op::Shape("f32[16,100]"), op::Parameter(1)); + auto partially_replicated_lhs = AllOf( + op::Shape("f32[12,8,100]"), + op::AllReduce(op::DynamicUpdateSlice(op::Broadcast(_), lhs, _, _, _))); + auto dot = + AllOf(op::Shape("f32[12,8,16]"), op::Dot(partially_replicated_lhs, rhs)); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, dot); +} + +TEST_F(SpmdPartitioningTest, DotPartialContractingPartialMatch) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[24,8,100] parameter(0), sharding={devices=[1,2,2]0,1,2,3} + %rhs = f32[32,8,100] parameter(1), + sharding={devices=[1,1,2,2]0,2,1,3 last_tile_dim_replicate} + ROOT %dot = f32[24,32] dot(%lhs, %rhs), + lhs_batch_dims={}, rhs_batch_dims={}, + lhs_contracting_dims={1,2}, rhs_contracting_dims={1,2}, + sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, 
/*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto lhs = AllOf(op::Shape("f32[24,4,50]"), op::Parameter(0)); + auto rhs = AllOf(op::Shape("f32[32,8,50]"), op::Parameter(1)); + auto dot = AllOf(op::Shape("f32[24,32]"), + op::Dot(lhs, AllOf(op::Shape("f32[32,4,50]"), + op::DynamicSlice(rhs, _, _, _)))); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::AllReduce(op::AllReduce(dot))); +} + +TEST_F(SpmdPartitioningTest, DotNonContractingPartialMatchContractingMatch) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[24,8,100] parameter(0), sharding={devices=[2,1,2]0,1,2,3} + %rhs = f32[100,50] parameter(1), sharding={devices=[2,2]0,2,1,3} + ROOT %dot = f32[24,8,50] dot(%lhs, %rhs), + lhs_batch_dims={}, rhs_batch_dims={}, + lhs_contracting_dims={2}, rhs_contracting_dims={0}, + sharding={devices=[2,2,1]0,1,2,3} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto lhs = AllOf(op::Shape("f32[12,8,50]"), op::Parameter(0)); + auto rhs = AllOf(op::Shape("f32[50,25]"), op::Parameter(1)); + auto dot = AllOf( + op::Shape("f32[12,8,50]"), + op::Dot(lhs, AllOf(op::Shape("f32[50,50]"), + op::AllReduce(op::DynamicUpdateSlice(_, rhs, _, _))))); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("f32[12,4,50]"), + op::DynamicSlice(op::AllReduce(dot), _, _, _))) + << module->ToString(); +} + +TEST_F(SpmdPartitioningTest, DotLHSMutiNonContractingRHSNotMatch) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[24,8,10] parameter(0), sharding={devices=[2,2,1]0,1,2,3} + %rhs = f32[10,50] parameter(1), + sharding={devices=[2,1,2]0,2,1,3 last_tile_dim_replicate} + ROOT %dot = f32[24,8,50] dot(%lhs, %rhs), + lhs_batch_dims={}, rhs_batch_dims={}, + lhs_contracting_dims={2}, rhs_contracting_dims={0}, + sharding={devices=[2,2,1]0,1,2,3} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto lhs = AllOf(op::Shape("f32[12,4,10]"), op::Parameter(0)); + auto rhs = AllOf(op::Shape("f32[5,50]"), op::Parameter(1)); + auto dot = AllOf( + op::Shape("f32[12,4,50]"), + op::Dot(lhs, AllOf(op::Shape("f32[10,50]"), + op::AllReduce(op::DynamicUpdateSlice(_, rhs, _, _))))); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, dot) << module->ToString(); +} + TEST_F(SpmdPartitioningTest, ElementwiseTest_PartialReplicateToTiledHaloExchange) { const char* const hlo_string = R"( @@ -4531,6 +4831,266 @@ ENTRY entry { EXPECT_THAT(root, AllOf(op::Shape("f32[2,3]"), op::Add(add_lhs, add_rhs))); } +TEST_F(SpmdPartitioningTest, TileToPartialReplicateReshard) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[8,8] parameter(0) + %copy = f32[8,8] copy(%param0), + sharding={devices=[2,2]0,1,2,3} + ROOT %copy0 = f32[8,8] copy(%copy), + sharding={devices=[2,1,2]0,1,2,3 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto tiled = AllOf(op::Shape("f32[4,4]"), + op::Copy(op::DynamicSlice(op::Parameter(0), op::Reshape(), + op::Reshape()))); + auto partially_replicated = AllOf( + op::Shape("f32[4,8]"), op::Copy(op::AllReduce(op::DynamicUpdateSlice( + op::Broadcast(_), tiled, _, _)))); + auto root = 
module->entry_computation()->root_instruction(); + EXPECT_THAT(root, partially_replicated); +} + +TEST_F(SpmdPartitioningTest, PartialReplicateToTileReshard) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[8,8] parameter(0) + %copy = f32[8,8] copy(%param0), + sharding={devices=[2,1,2]0,1,2,3 last_tile_dim_replicate} + ROOT %copy0 = f32[8,8] copy(%copy), + sharding={devices=[2,2]0,1,2,3} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto partially_replicated = + AllOf(op::Shape("f32[4,8]"), + op::Copy(op::DynamicSlice(op::Parameter(0), op::Reshape(), + op::Constant()))); + auto tiled = AllOf(op::Shape("f32[4,4]"), + op::Copy(op::DynamicSlice(partially_replicated, + op::Constant(), op::Reshape()))); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, tiled); +} + +TEST_F(SpmdPartitioningTest, + PartialReplicateToPartialReplicateReshard_AllReduce) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[8,8] parameter(0) + %copy = f32[8,8] copy(param0), + sharding={devices=[2,2,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + ROOT %copy0 = f32[8,8] copy(%copy), + sharding={devices=[2,1,4]0,1,2,3,4,5,6,7 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + + VLOG(1) << module->ToString(); + auto partially_replicated_init = + AllOf(op::Shape("f32[4,4]"), + op::Copy(op::DynamicSlice(op::Parameter(0), op::Reshape(), + op::Reshape()))); + auto partially_replicated = + AllOf(op::Shape("f32[4,8]"), + op::Copy(op::AllReduce(op::DynamicUpdateSlice( + op::Broadcast(_), partially_replicated_init, _, _)))); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, partially_replicated); +} + +TEST_F(SpmdPartitioningTest, + PartialReplicateToPartialReplicateReshard_DynamicSlice) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[8,8] parameter(0) + %copy = f32[8,8] copy(%param0), + sharding={devices=[2,1,4]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + ROOT %copy0 = f32[8,8] copy(%copy), + sharding={devices=[2,2,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + VLOG(1) << module->ToString(); + auto partially_replicated = + AllOf(op::Shape("f32[4,8]"), + op::Copy(op::DynamicSlice(op::Parameter(0), op::Reshape(), + op::Constant()))); + auto tiled = AllOf(op::Shape("f32[4,4]"), + op::Copy(op::DynamicSlice(partially_replicated, + op::Constant(), op::Reshape()))); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, tiled); +} + +TEST_F(SpmdPartitioningTest, + PartialReplicateToPartialReplicateReshard_DynamicSlice2) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[8,8] parameter(0) + %copy = f32[8,8] copy(%param0), + sharding={devices=[1,1,8]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + ROOT %copy0 = f32[8,8] copy(%copy), + sharding={devices=[2,2,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + VLOG(1) << module->ToString(); + auto partially_replicated = + AllOf(op::Shape("f32[8,8]"), + op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), + op::Constant()))); + auto tiled = AllOf(op::Shape("f32[4,4]"), + 
op::Copy(op::DynamicSlice(partially_replicated, + op::Reshape(), op::Reshape()))); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, tiled); +} + +TEST_F(SpmdPartitioningTest, + PartialReplicateToPartialReplicateReshardWithCollectivePermute) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[8,8] parameter(0) + %copy = f32[8,8] copy(param0), + sharding={devices=[2,2,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + ROOT %copy0 = f32[8,8] copy(%copy), + sharding={devices=[1,2,4]0,1,2,3,4,5,6,7 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + + VLOG(1) << module->ToString(); + auto partially_replicated_init = + AllOf(op::Shape("f32[4,4]"), + op::CollectivePermute(op::Copy(op::DynamicSlice( + op::Parameter(0), op::Reshape(), op::Reshape())))); + auto partially_replicated = + AllOf(op::Shape("f32[8,4]"), + op::Copy(op::AllReduce(op::DynamicUpdateSlice( + op::Broadcast(_), partially_replicated_init, _, _)))); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, partially_replicated); +} + +TEST_F(SpmdPartitioningTest, + PartialReplicateToPartialReplicateReshardCollectivePermute1) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[8,8] parameter(0) + %copy = f32[8,8] copy(%param0), + sharding={devices=[1,2,4]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + ROOT %copy0 = f32[8,8] copy(%copy), + sharding={devices=[2,2,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + VLOG(1) << module->ToString(); + auto partially_replicated = + AllOf(op::Shape("f32[8,4]"), + op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), + op::Reshape()))); + auto tiled = + AllOf(op::Shape("f32[4,4]"), + op::Copy(op::CollectivePermute(op::DynamicSlice( + partially_replicated, op::Reshape(), op::Constant())))); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, tiled); +} + +TEST_F(SpmdPartitioningTest, + PartialReplicateToPartialReplicateReshardHaloExchange) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[6,3] parameter(0) + %copy = f32[6,3] copy(param0), + sharding={devices=[4,1,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + ROOT %copy0 = f32[6,3] copy(%copy), + sharding={devices=[2,1,4]0,1,2,3,4,5,6,7 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + + VLOG(1) << module->ToString(); + auto partially_replicated_init = + AllOf(op::Shape("f32[2,3]"), + op::Copy(op::DynamicSlice(op::Pad(op::Parameter(0), op::Constant()), + op::Reshape(), op::Constant()))); + auto slice = + AllOf(op::Shape("f32[2,3]"), + op::DynamicSlice(op::Concatenate(op::CollectivePermute(op::Slice( + partially_replicated_init)), + partially_replicated_init), + _, _)); + auto partially_replicated = + AllOf(op::Shape("f32[3,3]"), + op::Copy(op::Slice(op::AllReduce( + op::DynamicUpdateSlice(op::Broadcast(_), slice, _, _))))); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, partially_replicated); +} + +TEST_F(SpmdPartitioningTest, + PartialReplicateToPartialReplicateReshardHaloExchange1) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[6,3] parameter(0) + %copy = f32[6,3] copy(param0), + 
sharding={devices=[2,1,4]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + ROOT %copy0 = f32[6,3] copy(%copy), + sharding={devices=[4,1,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + + VLOG(1) << module->ToString(); + auto partially_replicated_init = + AllOf(op::Shape("f32[3,3]"), + op::Copy(op::DynamicSlice(op::Parameter(0), op::Reshape(), + op::Constant()))); + auto slice = AllOf( + op::Shape("f32[4,3]"), + op::DynamicSlice(op::Pad(op::Concatenate(partially_replicated_init, + op::CollectivePermute(op::Slice( + partially_replicated_init))), + op::Constant()), + _, _)); + auto partially_replicated = + AllOf(op::Shape("f32[2,3]"), op::Copy(op::DynamicSlice(slice, _, _))); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, partially_replicated); +} + } // namespace } // namespace spmd } // namespace xla diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc index da2a3a44405..0edbd4f2b8d 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc @@ -29,6 +29,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_sharding.h" #include "tensorflow/compiler/xla/service/hlo_sharding_util.h" #include "tensorflow/compiler/xla/service/pattern_matcher.h" @@ -202,13 +203,17 @@ std::vector MakePartitionOffsets( absl::Span dims) { CHECK(!shape.IsTuple()); - Array2D offset_array( - {sharding.tile_assignment().num_elements(), shape.rank()}); - offset_array.Each([&](int64 i, int64 j, int32* value) { - *value = sharding.TileOffsetForDevice(shape, i)[j]; - }); - auto offset_table = b->AddInstruction(HloInstruction::CreateConstant( - LiteralUtil::CreateR2FromArray2D(offset_array))); + std::vector> offset_arrays(shape.rank()); + for (int64 i = 0; i < shape.rank(); ++i) { + offset_arrays[i].resize(sharding.tile_assignment().num_elements()); + } + auto shard_shape = MakePartitionedShape(shape, sharding); + sharding.tile_assignment().Each( + [&](absl::Span indices, int64 device) { + for (int64 i = 0; i < shape.rank(); ++i) { + offset_arrays[i][device] = indices[i] * shard_shape.dimensions(i); + } + }); std::vector offsets; for (int64 i = 0; i < shape.rank(); ++i) { if (sharding.tile_assignment().dim(i) == 1 || @@ -216,11 +221,10 @@ std::vector MakePartitionOffsets( offsets.push_back(b->AddInstruction( HloInstruction::CreateConstant(LiteralUtil::Zero(S32)))); } else { + auto offset_table = b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR1(offset_arrays[i]))); auto index = b->AddInstruction(HloInstruction::CreateDynamicSlice( - ShapeUtil::MakeShape(S32, {1, 1}), offset_table, - {partition_id, b->AddInstruction(HloInstruction::CreateConstant( - LiteralUtil::CreateR0(i)))}, - {1, 1})); + ShapeUtil::MakeShape(S32, {1}), offset_table, {partition_id}, {1})); offsets.push_back(b->AddInstruction( HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {}), index))); } @@ -292,17 +296,29 @@ HloInstruction* PadBaseShapeBeforeUnevenTiledSharding( return PadToShape(hlo, padded_base_shape, b); } -// TODO(wangtao): generize this function when target is 
partial replicate. -absl::optional PartialReplicateToTileCompatibleSharding( - const HloSharding& partial_sharding, - const std::vector& target_tile_dims) { +absl::optional PartialReplicateReshardCompatibleSharding( + const HloSharding& partial_sharding, const HloSharding& target_sharding) { if (!partial_sharding.ReplicateOnLastTileDim()) { return absl::nullopt; } int64 rank = partial_sharding.tile_assignment().num_dimensions() - 1; - if (target_tile_dims.size() < rank) { + int64 target_rank = target_sharding.tile_assignment().num_dimensions() - + (target_sharding.ReplicateOnLastTileDim() ? 1 : 0); + if (target_rank != rank) { return absl::nullopt; } + + absl::flat_hash_map device_to_replication_group; + partial_sharding.tile_assignment().Each( + [&](absl::Span indices, int64 device) { + int64 gid = 0; + for (int64 i = 0; i < rank; ++i) { + gid *= partial_sharding.tile_assignment().dim(i); + gid += indices[i]; + } + device_to_replication_group[device] = gid; + }); + // A dimension is expanded when target_tile_size > partial_tile_size and // target_tile_size % partial_tile_size == 0. // expand_tile_dims_positions is the index of the expand_dim. @@ -312,7 +328,7 @@ absl::optional PartialReplicateToTileCompatibleSharding( int num_expand_dims = 0; for (int64 dim = 0; dim < rank; dim++) { int64 partial_tile_size = partial_sharding.tile_assignment().dim(dim); - int64 target_tile_size = target_tile_dims[dim]; + int64 target_tile_size = target_sharding.tile_assignment().dim(dim); if (target_tile_size % partial_tile_size != 0 || target_tile_size < partial_tile_size) { return absl::nullopt; @@ -325,14 +341,26 @@ absl::optional PartialReplicateToTileCompatibleSharding( } // Reshape the partial replicate tile_dimensions. + int64 num_target_replication = 1; + if (target_sharding.ReplicateOnLastTileDim()) { + num_target_replication = + target_sharding.tile_assignment().dimensions().back(); + } auto reshape_dimensions = partial_sharding.tile_assignment().dimensions(); int64 num_replication = reshape_dimensions.back(); - if (num_replication != Product(expand_tile_sizes)) { + if (num_replication / num_target_replication != Product(expand_tile_sizes) || + num_replication % num_target_replication != 0) { return absl::nullopt; } + reshape_dimensions.pop_back(); reshape_dimensions.insert(reshape_dimensions.end(), expand_tile_sizes.begin(), expand_tile_sizes.end()); + + if (target_sharding.ReplicateOnLastTileDim()) { + reshape_dimensions.push_back(num_target_replication); + } + auto reshape_tile_assignment = partial_sharding.tile_assignment(); reshape_tile_assignment.Reshape(reshape_dimensions); @@ -346,13 +374,31 @@ absl::optional PartialReplicateToTileCompatibleSharding( } } auto transpose_sharding = hlo_sharding_util::TransposeSharding( - HloSharding::Tile(reshape_tile_assignment), perm); + target_sharding.ReplicateOnLastTileDim() + ? 
HloSharding::PartialTile(reshape_tile_assignment) + : HloSharding::Tile(reshape_tile_assignment), + perm); // Reshape to target shape auto transpose_tile_assignment = transpose_sharding.tile_assignment(); - transpose_tile_assignment.Reshape(target_tile_dims); + transpose_tile_assignment.Reshape( + target_sharding.tile_assignment().dimensions()); - return HloSharding::Tile(transpose_tile_assignment); + bool groups_matching = true; + target_sharding.tile_assignment().Each( + [&](absl::Span indices, int64 device) { + if (device_to_replication_group[device] != + device_to_replication_group[transpose_tile_assignment(indices)]) { + groups_matching = false; + } + }); + + if (groups_matching) { + return target_sharding; + } + return target_sharding.ReplicateOnLastTileDim() + ? HloSharding::PartialTile(transpose_tile_assignment) + : HloSharding::Tile(transpose_tile_assignment); } absl::optional TileToPartialReplicateHaloExchange( @@ -581,7 +627,10 @@ absl::optional UniqueTiledDim(const HloSharding& sharding) { return absl::nullopt; } int64 dim = -1; - for (int64 i = 0; i < sharding.tile_assignment().num_dimensions(); ++i) { + int64 rank = sharding.ReplicateOnLastTileDim() + ? sharding.tile_assignment().num_dimensions() - 1 + : sharding.tile_assignment().num_dimensions(); + for (int64 i = 0; i < rank; ++i) { if (sharding.tile_assignment().dim(i) > 1) { if (dim != -1) { return absl::nullopt; @@ -1403,7 +1452,7 @@ HloSharding UngroupSharding(const GroupedSharding& grouped_sharding) { } for (int64 i = 0; i < grouped_sharding.group_dims.size(); ++i) { int64 dim = grouped_sharding.group_dims[i]; - tiling_dims[dim] = grouped_sharding.group_dim_sizes[i]; + tiling_dims[dim] *= grouped_sharding.group_dim_sizes[i]; } Array tiling(tiling_dims); grouped_tiling.Each([&](absl::Span indices, int64 device) { @@ -1411,9 +1460,12 @@ HloSharding UngroupSharding(const GroupedSharding& grouped_sharding) { for (int64 g = 0; g < grouped_sharding.device_groups.size(); ++g) { int64 remaining_group_index = g; for (int64 i = grouped_sharding.group_dims.size() - 1; i >= 0; --i) { - ungrouped_inds[grouped_sharding.group_dims[i]] = - remaining_group_index % grouped_sharding.group_dim_sizes[i]; - remaining_group_index /= grouped_sharding.group_dim_sizes[i]; + int64 dim = grouped_sharding.group_dims[i]; + int64 groups_in_this_dim = grouped_sharding.group_dim_sizes[i]; + ungrouped_inds[dim] = (remaining_group_index % groups_in_this_dim) * + grouped_tiling.dim(dim) + + indices[dim]; + remaining_group_index /= groups_in_this_dim; } tiling(ungrouped_inds) = grouped_sharding.device_groups[g][device]; } @@ -1684,5 +1736,47 @@ absl::optional ParseReductionComputation( return root->opcode(); } +absl::optional> FindMatchingPartitionedDimsForGrouping( + const HloSharding& sharding, + const std::vector>& device_groups) { + if (sharding.NumTiles() < device_groups.size() || device_groups.size() < 2 || + device_groups[0].size() < 2) { + return absl::nullopt; + } + int64 rank = sharding.tile_assignment().num_dimensions(); + if (sharding.ReplicateOnLastTileDim()) { + rank--; + } + absl::flat_hash_map> device_to_index; + sharding.tile_assignment().Each( + [&](absl::Span index, int64 device) { + device_to_index[device] = + std::vector(index.begin(), index.begin() + rank); + }); + std::vector dims; + int64 group_count = 1; + for (int64 i = 0; i < rank; ++i) { + if (device_to_index[device_groups[0][0]][i] == + device_to_index[device_groups[0][1]][i]) { + dims.push_back(i); + group_count *= sharding.tile_assignment().dim(i); + } + } + if (group_count 
!= device_groups.size()) { + return absl::nullopt; + } + for (const auto& group : device_groups) { + for (int64 i = 1; i < group.size(); ++i) { + if (absl::c_any_of(dims, [&](const int64 dim) { + return device_to_index[group[i]][dim] != + device_to_index[group[0]][dim]; + })) { + return absl::nullopt; + } + } + } + return dims; +} + } // namespace spmd } // namespace xla diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h index 69ed90a4b66..f6f15481b55 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h @@ -356,8 +356,8 @@ absl::optional PadFromPartialReplicateShape( const SPMDCollectiveOpsCreator& collective_ops_creator, int64* next_channel_id, HloInstruction* partition_id, SpmdBuilder* b); -// Get the compatible sharding from a partial replicate sharding to a given -// target tile dimensions. +// Get the compatible sharding from a partial replicate sharding to a desired +// target tiled sharding. // Compatible means replicate sharding can transform to the target tile // dimensions by dynamic slice. // For example, if partial_sharding is @@ -366,9 +366,9 @@ absl::optional PadFromPartialReplicateShape( // sharding={devices=[1,2,2]0,2,1,3 last_tile_dim_replicate}. // If patial replicate sharding is not partial replicate or can't reshard to // target_tile_dims by dynamic slice, return absl::nullopt. -absl::optional PartialReplicateToTileCompatibleSharding( - const HloSharding& partial_sharding, - const std::vector& target_tile_dims); +// If target_sharding is already compatible, returns it. +absl::optional PartialReplicateReshardCompatibleSharding( + const HloSharding& partial_sharding, const HloSharding& target_sharding); // Do left halo exchange if all-reduce directly from tile sharding to partial // replicate sharding will remove useful data from the source. @@ -379,6 +379,12 @@ absl::optional TileToPartialReplicateHaloExchange( const SPMDCollectiveOpsCreator& collective_ops_creator, int64* next_channel_id, HloInstruction* partition_id, SpmdBuilder* b); +// Finds a list of dimensions that can be grouped on such that it will have the +// specified device groups. Group order and dimension order are ignored. 
+absl::optional> FindMatchingPartitionedDimsForGrouping( + const HloSharding& sharding, + const std::vector>& device_groups); + } // namespace spmd } // namespace xla diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc index c66f9d96a50..e2b977ad493 100644 --- a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc +++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc @@ -333,10 +333,10 @@ TEST_F(TuplePointsToAnalysisTest, CopyStartAndCopyDone) { auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); - auto copy_start = builder.AddInstruction(HloInstruction::CreateUnary( + auto copy_start = builder.AddInstruction(HloInstruction::CreateCopyStart( ShapeUtil::MakeTupleShape({constant->shape(), constant->shape(), ShapeUtil::MakeShape(U32, {})}), - HloOpcode::kCopyStart, constant)); + constant)); auto copy_done = builder.AddInstruction(HloInstruction::CreateUnary( constant->shape(), HloOpcode::kCopyDone, copy_start)); diff --git a/tensorflow/compiler/xla/shape_layout.h b/tensorflow/compiler/xla/shape_layout.h index b4982f1d8e4..64c9635f335 100644 --- a/tensorflow/compiler/xla/shape_layout.h +++ b/tensorflow/compiler/xla/shape_layout.h @@ -61,6 +61,10 @@ class ShapeLayout { // Returns the shape (with layouts). const Shape& shape() const { return shape_; } + // Clear dynamic dimensions of this module. Pretending the module creates + // static results. Useful in inspecting full outputs when testing. + void ClearDynamicShape() { shape_.clear_dynamic_dimensions(); } + // Checks that a layout is set for the shape, and returns a reference to the // layout directly on the shape. Shape must not be a tuple. 
const Layout& layout() const; diff --git a/tensorflow/compiler/xla/tests/dynamism_inference_test.cc b/tensorflow/compiler/xla/tests/dynamism_inference_test.cc index ba4092def16..a7e032448e0 100644 --- a/tensorflow/compiler/xla/tests/dynamism_inference_test.cc +++ b/tensorflow/compiler/xla/tests/dynamism_inference_test.cc @@ -104,12 +104,26 @@ TEST_F(DynamismInferenceTest, ScalarInt32Literal) { } } +TEST_F(DynamismInferenceTest, TupleSimple) { + for (ClientType client_type : client_types) { + Client* client = ClientOrDie(platform_, client_type); + XlaBuilder b(TestName()); + auto c = ConstantR0(&b, 42); + auto p = Parameter(&b, 0, ShapeUtil::MakeScalarShape(S32), "p0"); + + auto tuple = Tuple(&b, {c, p}); + EXPECT_EQ(ComputeDynamismScalar(client, tuple, &b, {0}).ValueOrDie(), + false); + EXPECT_EQ(ComputeDynamismScalar(client, tuple, &b, {1}).ValueOrDie(), true); + } +} + TEST_F(DynamismInferenceTest, TupleGteKeepsDynamism) { for (ClientType client_type : client_types) { Client* client = ClientOrDie(platform_, client_type); XlaBuilder b(TestName()); auto c = ConstantR0(&b, 42); - auto p = Parameter(&b, 0, ShapeUtil::MakeScalarShape(S32), "0"); + auto p = Parameter(&b, 0, ShapeUtil::MakeScalarShape(S32), "p0"); auto tuple = Tuple(&b, {c, p}); auto gte0 = GetTupleElement(tuple, 0); @@ -122,12 +136,25 @@ TEST_F(DynamismInferenceTest, TupleGteKeepsDynamism) { } } +TEST_F(DynamismInferenceTest, PredValueUsedTwice) { + for (ClientType client_type : client_types) { + Client* client = ClientOrDie(platform_, client_type); + XlaBuilder b(TestName()); + auto c = ConstantR0(&b, 42); + auto p = Parameter(&b, 0, ShapeUtil::MakeScalarShape(S32), "p0"); + auto pred = Eq(c, p); + auto result = Select(pred, p, c); + EXPECT_EQ(ComputeDynamismScalar(client, result, &b, {}).ValueOrDie(), + false); + } +} + TEST_F(DynamismInferenceTest, ConcatSliceReshapeKeepsDynamism) { for (ClientType client_type : client_types) { Client* client = ClientOrDie(platform_, client_type); XlaBuilder b(TestName()); auto c = ConstantR0(&b, 42); - auto p = Parameter(&b, 0, ShapeUtil::MakeScalarShape(S32), "0"); + auto p = Parameter(&b, 0, ShapeUtil::MakeScalarShape(S32), "p0"); auto concat = ConcatScalars(&b, {c, p}); auto slice0 = SliceInDim(concat, 0, 1, 1, 0); @@ -146,7 +173,7 @@ TEST_F(DynamismInferenceTest, ParameterIsDynamic) { for (ClientType client_type : client_types) { Client* client = ClientOrDie(platform_, client_type); XlaBuilder b(TestName()); - auto computation = Parameter(&b, 0, ShapeUtil::MakeScalarShape(S32), "0"); + auto computation = Parameter(&b, 0, ShapeUtil::MakeScalarShape(S32), "p0"); auto value = ComputeDynamismScalar(client, computation, &b); ASSERT_TRUE(value.ok()) << value.status(); @@ -160,7 +187,7 @@ TEST_F(DynamismInferenceTest, UnaryOpKeepsDynamism) { Client* client = ClientOrDie(platform_, client_type); XlaBuilder b(TestName()); auto c = ConstantR0(&b, 42); - auto p = Parameter(&b, 0, ShapeUtil::MakeScalarShape(S32), "0"); + auto p = Parameter(&b, 0, ShapeUtil::MakeScalarShape(S32), "p0"); auto neg0 = Neg(c); auto neg1 = Neg(p); @@ -177,7 +204,7 @@ TEST_F(DynamismInferenceTest, BinaryOpsOrsDynamism) { Client* client = ClientOrDie(platform_, client_type); XlaBuilder b(TestName()); auto c = ConstantR0(&b, 42); - auto p = Parameter(&b, 0, ShapeUtil::MakeScalarShape(S32), "0"); + auto p = Parameter(&b, 0, ShapeUtil::MakeScalarShape(S32), "p0"); // Static value + static value = static auto add1 = Add(c, c); @@ -198,8 +225,8 @@ TEST_F(DynamismInferenceTest, GetDimensionSize) { // param = Param([<=2, 3]) // 
get_dimension_size(param, 0) is dynamic // get_dimension_size(param, 1) is static - auto p = - Parameter(&b, 0, ShapeUtil::MakeShape(S32, {2, 3}, {true, false}), "0"); + auto p = Parameter(&b, 0, ShapeUtil::MakeShape(S32, {2, 3}, {true, false}), + "p0"); auto gds0 = GetDimensionSize(p, 0); auto gds1 = GetDimensionSize(p, 1); diff --git a/tensorflow/compiler/xla/tests/exhaustive_binary_16_bit_test.cc b/tensorflow/compiler/xla/tests/exhaustive_binary_16_bit_test.cc index 09c91d4be14..dca8e31e792 100644 --- a/tensorflow/compiler/xla/tests/exhaustive_binary_16_bit_test.cc +++ b/tensorflow/compiler/xla/tests/exhaustive_binary_16_bit_test.cc @@ -123,8 +123,16 @@ BINARY_TEST_16BIT(Min, { }) // TODO(bixia): Pow fails with bfloat16 on CPU. -BINARY_TEST_16BIT(DISABLED_ON_CPU(Pow), - { Run(AddEmptyBroadcastDimension(Pow), std::pow); }) +BINARY_TEST_16BIT(DISABLED_ON_CPU(Pow), { + // See b/162664705. + known_incorrect_fn_ = [](int64 val) { + Eigen::bfloat16 f; + uint16_t val_16 = val; + memcpy(&f, &val_16, 2); + return std::isnan(f); + }; + Run(AddEmptyBroadcastDimension(Pow), std::pow); +}) // TODO(bixia): Atan2 fails with bfloat16 on CPU. BINARY_TEST_16BIT(DISABLED_ON_CPU(Atan2), diff --git a/tensorflow/compiler/xla/tests/gather_operation_test.cc b/tensorflow/compiler/xla/tests/gather_operation_test.cc index 0fd5f191db0..0f8a4c1e273 100644 --- a/tensorflow/compiler/xla/tests/gather_operation_test.cc +++ b/tensorflow/compiler/xla/tests/gather_operation_test.cc @@ -711,6 +711,24 @@ ENTRY main { RunTest(hlo_text, &operand, &start_indices); } +XLA_TEST_F(GatherOperationTest, GatherFromScalarNonZeroIndices) { + const string hlo_text = R"( +HloModule GatherFromScalar + +ENTRY main { + operand = f32[1,1,1] parameter(0) + indices = s32[2,3,50] parameter(1) + ROOT gather = f32[1,2,50] gather(operand, indices), + offset_dims={0}, + collapsed_slice_dims={0,1}, + start_index_map={1,0,2}, + index_vector_dim=1, + slice_sizes={1,1,1} +} +)"; + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{0, 0})); +} + class GatherClientLibraryTest : public ClientLibraryTestBase {}; // Disabled on interpreter since ExecuteAsyncOnStream is not supported. 
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc index d0b6e5f80ed..663e7d81006 100644 --- a/tensorflow/compiler/xla/tests/hlo_test_base.cc +++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc @@ -230,6 +230,19 @@ StatusOr> HloTestBase::ExecuteReplicated( device_assignment); } +StatusOr> HloTestBase::ExecuteReplicated( + std::function executable_provider, + std::function argument_count_provider, + std::function argument_provider, + int64 num_replicas, bool run_hlo_passes) { + HloRunner::ReplicatedExecuteOptions options; + options.num_replicas = num_replicas; + options.run_hlo_passes = run_hlo_passes; + options.use_threads = true; + return test_runner_.ExecuteReplicated( + executable_provider, argument_count_provider, argument_provider, options); +} + StatusOr> HloTestBase::MakeReferenceModule( const HloModule& test_module, const std::function& reference_preprocessor) { diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h index 17c2a55ba5b..fc680e39682 100644 --- a/tensorflow/compiler/xla/tests/hlo_test_base.h +++ b/tensorflow/compiler/xla/tests/hlo_test_base.h @@ -169,6 +169,13 @@ class HloTestBase : public ManifestCheckingTest { int64 num_replicas, DeviceAssignment* device_assignment, bool run_hlo_passes, bool use_threads); + // Same as above, but allows passing different programs for replicas. + StatusOr> ExecuteReplicated( + std::function executable_provider, + std::function argument_count_provider, + std::function argument_provider, + int64 num_replicas, bool run_hlo_passes); + // Executes the given hlo module on two backends and compares results. // // 'arguments': the input of the hlo module. diff --git a/tensorflow/compiler/xla/util.cc b/tensorflow/compiler/xla/util.cc index 1fbce96625b..4034e5fdd27 100644 --- a/tensorflow/compiler/xla/util.cc +++ b/tensorflow/compiler/xla/util.cc @@ -31,10 +31,10 @@ limitations under the License. #include "absl/strings/str_split.h" #include "absl/types/optional.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/bfloat16/bfloat16.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/math/math_util.h" #include "tensorflow/core/lib/strings/numbers.h" +#include "tensorflow/core/platform/bfloat16.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/numbers.h" diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index b9fe544783c..e45e0000017 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -318,7 +318,6 @@ alias( cc_library( name = "lib_proto_parsing", hdrs = [ - "//tensorflow/core/lib/bfloat16:bfloat16.h", "//tensorflow/core/lib/core:legacy_lib_proto_parsing_headers", "//tensorflow/core/lib/strings:legacy_lib_proto_parsing_headers", "//tensorflow/core/platform:lib_proto_parsing_hdrs", @@ -328,7 +327,6 @@ cc_library( ":platform_base", "@com_google_absl//absl/strings", "@double_conversion//:double-conversion", - "//tensorflow/core/lib/bfloat16", "//tensorflow/core/lib/core:errors", "//tensorflow/core/lib/core:stringpiece", "//tensorflow/core/lib/core:status", @@ -353,6 +351,7 @@ cc_library( cc_library( name = "lib", hdrs = [ + # TODO(rmlarsen): Remove bfloat16.h once dependency in third_party/swift is updated. 
"//tensorflow/core/lib/bfloat16:bfloat16.h", "//tensorflow/core/lib/core:legacy_lib_core_headers", "//tensorflow/core/lib/gtl:legacy_lib_gtl_headers", @@ -582,7 +581,6 @@ cc_library( "//tensorflow/core/framework:numeric_types.h", "//tensorflow/core/framework:tensor_types.h", "//tensorflow/core/framework:type_traits.h", - "//tensorflow/core/lib/bfloat16:bfloat16.h", "//tensorflow/core/platform:framework_lite_hdrs", "//tensorflow/core/platform/default:integral_types.h", "//tensorflow/core/platform/default:logging.h", @@ -593,7 +591,6 @@ cc_library( "@nsync//:nsync_cpp", ] + [ "//third_party/eigen3", - "//tensorflow/core/lib/bfloat16", "//tensorflow/core/platform:dynamic_annotations", "//tensorflow/core/platform:platform_port", "//tensorflow/core/platform:thread_annotations", @@ -1014,6 +1011,7 @@ cc_library( "//tensorflow/core/kernels:grappler", "//tensorflow/core/kernels:histogram_op", "//tensorflow/core/kernels:io", + "//tensorflow/core/kernels:isotonic_regression_op", "//tensorflow/core/kernels:lookup", "//tensorflow/core/kernels:logging", "//tensorflow/core/kernels:manip", @@ -1258,7 +1256,6 @@ filegroup( "//tensorflow/core/example:mobile_srcs_no_runtime", "//tensorflow/core/framework:attr_value_proto_text_srcs", "//tensorflow/core/framework:mobile_srcs_no_runtime", - "//tensorflow/core/lib/bfloat16:mobile_srcs_no_runtime", "//tensorflow/core/lib/core:mobile_srcs_no_runtime", "//tensorflow/core/lib/gtl:mobile_srcs_no_runtime", "//tensorflow/core/lib/hash:mobile_srcs_no_runtime", @@ -1696,7 +1693,6 @@ filegroup( "//tensorflow/core/framework:resource_handle.h", "//tensorflow/core/platform:legacy_lib_internal_headers", "//tensorflow/core/platform:lib_internal_private_hdrs", - "//tensorflow/core/lib/bfloat16:bfloat16.h", "//tensorflow/core/lib/core:legacy_lib_core_all_headers", "//tensorflow/core/lib/gtl:legacy_lib_gtl_all_headers", "//tensorflow/core/lib/histogram:legacy_lib_histogram_all_headers", @@ -1813,7 +1809,6 @@ cc_library( "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "//third_party/eigen3", - "//tensorflow/core/lib/bfloat16", "//tensorflow/core/lib/core:arena", "//tensorflow/core/lib/core:bitmap", "//tensorflow/core/lib/core:blocking_counter", @@ -1894,6 +1889,7 @@ cc_library( "//tensorflow/core/lib/strings:strcat", "//tensorflow/core/lib/strings:stringprintf", "//tensorflow/core/platform:abi", + "//tensorflow/core/platform:bfloat16", "//tensorflow/core/platform:base64", "//tensorflow/core/platform:blocking_counter", "//tensorflow/core/platform:casts", @@ -2021,7 +2017,6 @@ alias( cc_library( name = "tflite_portable_logging", hdrs = [ - "//tensorflow/core/lib/bfloat16:bfloat16.h", "//tensorflow/core/platform:tflite_portable_logging_hdrs", "//tensorflow/core/platform/default:integral_types.h", "//tensorflow/core/platform/default:logging.h", @@ -2051,7 +2046,6 @@ cc_library( hdrs = [ "lib/jpeg/jpeg_handle.h", "lib/jpeg/jpeg_mem.h", - "//tensorflow/core/lib/bfloat16:bfloat16.h", "//tensorflow/core/lib/core:legacy_lib_core_stringpiece_header", "//tensorflow/core/platform:jpeg_internal_hdrs", "//tensorflow/core/platform/default:integral_types.h", @@ -2078,7 +2072,6 @@ cc_library( ]), hdrs = [ "lib/gif/gif_io.h", - "//tensorflow/core/lib/bfloat16:bfloat16.h", "//tensorflow/core/lib/core:legacy_lib_core_stringpiece_header", "//tensorflow/core/lib/gtl:legacy_android_gif_internal_headers", "//tensorflow/core/platform:gif_internal_hdrs", @@ -2969,6 +2962,8 @@ filegroup( srcs = [ # PNG data "//tensorflow/core/lib/png:testdata", + "//tensorflow/core/lib/ssim:testdata", + 
"//tensorflow/core/lib/psnr:testdata", # JPEG data "lib/jpeg/testdata/jpeg_merge_test1.jpg", "lib/jpeg/testdata/jpeg_merge_test1_cmyk.jpg", @@ -2998,13 +2993,6 @@ filegroup( "lib/bmp/testdata/grayscale_small.bmp", "lib/bmp/testdata/grayscale_small_3channels.bmp", "lib/bmp/testdata/grayscale_small_4channels.bmp", - # SSIM, PSNR data - "lib/ssim/testdata/checkerboard1.png", - "lib/ssim/testdata/checkerboard2.png", - "lib/ssim/testdata/checkerboard3.png", - "lib/psnr/testdata/cat_q20.jpg", - "lib/psnr/testdata/cat_q72.jpg", - "lib/psnr/testdata/cat_q95.jpg", ], visibility = ["//visibility:public"], ) diff --git a/tensorflow/core/api_def/base_api/api_def_DataFormatVecPermute.pbtxt b/tensorflow/core/api_def/base_api/api_def_DataFormatVecPermute.pbtxt index d87c088899e..5e736078f18 100644 --- a/tensorflow/core/api_def/base_api/api_def_DataFormatVecPermute.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_DataFormatVecPermute.pbtxt @@ -24,8 +24,27 @@ END destination data format. END } - summary: "Returns the permuted vector/tensor in the destination data format given the" + summary: "Permute input tensor from `src_format` to `dst_format`." description: <