diff --git a/tensorflow/c/experimental/saved_model/core/BUILD b/tensorflow/c/experimental/saved_model/core/BUILD index dbe1b6d656c..bc9a5fd9442 100644 --- a/tensorflow/c/experimental/saved_model/core/BUILD +++ b/tensorflow/c/experimental/saved_model/core/BUILD @@ -3,6 +3,10 @@ # Targets in this directory are pure C++ "Classes" underlying the C API types # under tf/c/experimental/saved_model/public/. They are subject to change and # have visibility limited to Tensorflow's implementation only. +load( + "//tensorflow:tensorflow.bzl", + "tf_cc_test", +) package( default_visibility = [ @@ -47,6 +51,22 @@ cc_library( ], ) +cc_library( + name = "saved_model_utils", + srcs = [ + "saved_model_utils.cc", + ], + hdrs = [ + "saved_model_utils.h", + ], + deps = [ + "//tensorflow/c:tf_tensor_internal", + "//tensorflow/c/eager:immediate_execution_context", + "//tensorflow/c/experimental/saved_model/core/revived_types:constant", + "//tensorflow/core:protos_all_cc", + ], +) + cc_library( name = "tf_saved_model_impl", srcs = [ @@ -84,3 +104,26 @@ filegroup( ], visibility = ["//tensorflow/core:__pkg__"], ) + +tf_cc_test( + name = "saved_model_utils_test", + srcs = [ + "saved_model_utils_test.cc", + ], + deps = [ + ":saved_model_utils", + "//tensorflow/c:tensor_interface", + "//tensorflow/c/eager:abstract_tensor_handle", + "//tensorflow/c/eager:immediate_execution_context", + "//tensorflow/c/eager:immediate_execution_tensor_handle", + "//tensorflow/c/experimental/saved_model/core/revived_types:constant", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/common_runtime:core_cpu_lib", + "//tensorflow/core/common_runtime/eager:context", + "//tensorflow/core/common_runtime/eager:core", + ], +) diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/BUILD b/tensorflow/c/experimental/saved_model/core/revived_types/BUILD new file mode 100644 index 00000000000..ad3844e00a0 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/revived_types/BUILD @@ -0,0 +1,39 @@ +# This package contains classes corresponding to Revived SavedObjectGraph types +# used by SavedModel. See https://cs.opensource.google/tensorflow/tensorflow/+/c575e2ba93c442121d98d3f125d83fed1339924d:tensorflow/core/protobuf/saved_object_graph.proto;l=56-62 +package( + default_visibility = [ + # Restricting visibility for now + "//tensorflow/c/experimental/saved_model/core:__pkg__", + ], + licenses = ["notice"], # Apache 2.0 +) + +cc_library( + name = "constant", + srcs = [ + "constant.cc", + ], + hdrs = [ + "constant.h", + ], + deps = [ + ":tensorhandle_convertible", + "//tensorflow/c:tensor_interface", + "//tensorflow/c/eager:immediate_execution_context", + "//tensorflow/c/eager:immediate_execution_tensor_handle", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/common_runtime/eager:tensor_handle", + ], +) + +cc_library( + name = "tensorhandle_convertible", + hdrs = [ + "tensorhandle_convertible.h", + ], + deps = [ + "//tensorflow/c/eager:immediate_execution_tensor_handle", + ], +) diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/constant.cc b/tensorflow/c/experimental/saved_model/core/revived_types/constant.cc new file mode 100644 index 00000000000..0cabf83a123 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/revived_types/constant.cc @@ -0,0 +1,46 @@ +/* Copyright 2020 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/experimental/saved_model/core/revived_types/constant.h" + +#include + +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tensorhandle_convertible.h" +#include "tensorflow/core/common_runtime/eager/tensor_handle.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +Constant::Constant(ImmediateTensorHandlePtr handle) + : TensorHandleConvertible(std::move(handle)) {} + +Status Constant::Create(ImmediateExecutionContext* ctx, + AbstractTensorInterface* tensor, + std::unique_ptr* output) { + ImmediateExecutionTensorHandle* handle = ctx->CreateLocalHandle(tensor); + if (handle == nullptr) { + return errors::Internal("Failed to convert tensor to tensorhandle"); + } + output->reset(new Constant(ImmediateTensorHandlePtr(handle))); + return Status(); +} + +} // namespace tensorflow diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/constant.h b/tensorflow/c/experimental/saved_model/core/revived_types/constant.h new file mode 100644 index 00000000000..845a6f391c0 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/revived_types/constant.h @@ -0,0 +1,55 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_CONSTANT_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_CONSTANT_H_ + +#include + +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tensorhandle_convertible.h" +#include "tensorflow/c/tensor_interface.h" +#include "tensorflow/core/framework/tensor.pb.h" + +namespace tensorflow { + +// This class corresponds to python's tf.constant, which is effectively a +// TensorHandle explicitly initialized to some value. 
+// For now this doesn't do much beyond wrap Context's CreateLocalHandle method, +// and offer a subclass of TensorHandleConvertible. Note that similar to +// the python's eager mode logic, we bypass calling the "Const" op: +// https://github.com/tensorflow/tensorflow/blob/1c064ab76064c58e54261b805027474885a1534d/tensorflow/python/framework/constant_op.py#L301 +class Constant : public TensorHandleConvertible { + public: + static Status Create(ImmediateExecutionContext* ctx, + AbstractTensorInterface* tensor, + std::unique_ptr* output); + + // RevivedConstant is movable, but not copyable. + Constant(Constant&& other) = default; + Constant& operator=(Constant&& other) = default; + + ~Constant() override = default; + + private: + explicit Constant(ImmediateTensorHandlePtr handle); + Constant(const Constant&) = delete; + Constant& operator=(const Constant&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_CONSTANT_H_ diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/tensorhandle_convertible.h b/tensorflow/c/experimental/saved_model/core/revived_types/tensorhandle_convertible.h new file mode 100644 index 00000000000..98179586e83 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/revived_types/tensorhandle_convertible.h @@ -0,0 +1,49 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_TENSORHANDLE_CONVERTIBLE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_TENSORHANDLE_CONVERTIBLE_H_ + +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" + +namespace tensorflow { + +// A common interface for objects that can be converted to a TensorHandle. +// Examples of objects that implement this include Variables, Constants, Assets, +// etc. This is used to convert captured objects into a ConcreteFunction's +// captured TensorHandles: +// https://github.com/tensorflow/tensorflow/blob/676a68963ea4b64fe479b9cede06aa8f5b290ab8/tensorflow/python/saved_model/load.py#L229-L240 +class TensorHandleConvertible { + public: + explicit TensorHandleConvertible(ImmediateTensorHandlePtr handle) + : handle_(std::move(handle)) {} + + ImmediateExecutionTensorHandle* handle() { return handle_.get(); } + + // TensorHandleConvertible is movable, but not copyable. 
+ TensorHandleConvertible(TensorHandleConvertible&& other) = default; + TensorHandleConvertible& operator=(TensorHandleConvertible&& other) = default; + + virtual ~TensorHandleConvertible() = default; + + protected: + TensorHandleConvertible(const TensorHandleConvertible&) = delete; + TensorHandleConvertible& operator=(const TensorHandleConvertible&) = delete; + ImmediateTensorHandlePtr handle_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_TENSORHANDLE_CONVERTIBLE_H_ diff --git a/tensorflow/c/experimental/saved_model/core/saved_model_utils.cc b/tensorflow/c/experimental/saved_model/core/saved_model_utils.cc new file mode 100644 index 00000000000..9fe9caa27d7 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/saved_model_utils.cc @@ -0,0 +1,38 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/experimental/saved_model/core/saved_model_utils.h" + +#include "tensorflow/c/experimental/saved_model/core/revived_types/constant.h" +#include "tensorflow/c/tf_tensor_internal.h" + +namespace tensorflow { +namespace internal { + +Status TensorProtoToConstant(ImmediateExecutionContext* ctx, + const TensorProto& proto, + std::unique_ptr* output) { + tensorflow::Tensor tensor; + bool parse_result = tensor.FromProto(proto); + if (!parse_result) { + return errors::Internal("Failed to parse tensor from tensorproto"); + } + + TensorInterface tensor_interface(std::move(tensor)); + return Constant::Create(ctx, &tensor_interface, output); +} + +} // namespace internal +} // namespace tensorflow diff --git a/tensorflow/c/experimental/saved_model/core/saved_model_utils.h b/tensorflow/c/experimental/saved_model/core/saved_model_utils.h new file mode 100644 index 00000000000..5223f1c5f7d --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/saved_model_utils.h @@ -0,0 +1,39 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_SAVED_MODEL_UTILS_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_SAVED_MODEL_UTILS_H_ + +// Some internal utility functions for the SavedModelAPI, factored out into a +// separately unit-testable header. 
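As a quick orientation for readers of this patch, the intended call pattern for the utility declared below in this header (TensorProtoToConstant) looks roughly like the sketch that follows. It mirrors the unit test added later in this change; the helper name and the caller-owned context are assumptions for illustration, not part of the patch:

```cpp
// Sketch only: revive a serialized TensorProto into a Constant.
// Assumes `ctx` is an already-initialized ImmediateExecutionContext
// (for example an EagerContext) owned by the caller.
#include <memory>

#include "tensorflow/c/eager/immediate_execution_context.h"
#include "tensorflow/c/experimental/saved_model/core/revived_types/constant.h"
#include "tensorflow/c/experimental/saved_model/core/saved_model_utils.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor.pb.h"

tensorflow::Status ReviveScalarConstant(
    tensorflow::ImmediateExecutionContext* ctx,
    std::unique_ptr<tensorflow::Constant>* out) {
  // Build a scalar float tensor and serialize it the way a SavedModel
  // stores constants inside its SavedObjectGraph.
  tensorflow::Tensor value(tensorflow::DT_FLOAT, tensorflow::TensorShape({}));
  value.scalar<float>()() = 42.0f;
  tensorflow::TensorProto proto;
  value.AsProtoTensorContent(&proto);

  // Revive the proto into a Constant wrapping an eager tensor handle;
  // (*out)->handle() can then be used wherever an
  // ImmediateExecutionTensorHandle is expected.
  return tensorflow::internal::TensorProtoToConstant(ctx, proto, out);
}
```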
+ +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/constant.h" +#include "tensorflow/core/framework/tensor.pb.h" + +namespace tensorflow { +namespace internal { + +// Load a TensorProto into a tensorflow::Constant. This is similar to the +// constant loading logic in python: +// https://github.com/tensorflow/tensorflow/blob/516608035f85cec8b126712b0ff8407220206b22/tensorflow/python/saved_model/load.py#L437 +Status TensorProtoToConstant(ImmediateExecutionContext* ctx, + const TensorProto& proto, + std::unique_ptr* output); + +} // namespace internal +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_SAVED_MODEL_UTILS_H_ diff --git a/tensorflow/c/experimental/saved_model/core/saved_model_utils_test.cc b/tensorflow/c/experimental/saved_model/core/saved_model_utils_test.cc new file mode 100644 index 00000000000..483162574f7 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/saved_model_utils_test.cc @@ -0,0 +1,199 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/experimental/saved_model/core/saved_model_utils.h" + +#include + +#include +#include + +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/constant.h" +#include "tensorflow/c/tensor_interface.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/framework/numeric_types.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/bfloat16/bfloat16.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace { + +// Converts a tensorflow::DatatypeSet to std::vector. +// This is needed for GTest's ::testing::ValuesIn, since +// DataTypeSet doesn't fullfill all the constraints of an STL-like iterable. +std::vector DataTypeSetToVector(DataTypeSet set) { + std::vector result; + result.reserve(set.size()); + for (DataType dt : set) { + result.push_back(dt); + } + return result; +} + +// Returns a vector of shapes intended to be "interesting" test cases. 
+std::vector> InterestingShapes() { + std::vector> interesting_shapes; + interesting_shapes.push_back({}); // Scalar + interesting_shapes.push_back({10}); // 1D Vector + interesting_shapes.push_back({3, 3}); // 2D Matrix + interesting_shapes.push_back({1, 4, 6, 10}); // Higher Dimension Tensor + return interesting_shapes; +} + +// Fills a numeric tensor with `value`. +void FillNumericTensor(Tensor* tensor, int8 value) { + switch (tensor->dtype()) { +#define CASE(type) \ + case DataTypeToEnum::value: { \ + const auto& flattened = tensor->flat(); \ + for (int i = 0; i < tensor->NumElements(); ++i) { \ + flattened(i) = value; \ + } \ + break; \ + } + TF_CALL_INTEGRAL_TYPES(CASE); + TF_CALL_double(CASE); + TF_CALL_float(CASE); +#undef CASE + default: + CHECK(false) << "Unsupported data type: " + << DataTypeString(tensor->dtype()); + break; + } +} + +// Checks the underlying data is equal for the buffers for two numeric tensors. +// Note: The caller must ensure to check that the dtypes and sizes of the +// underlying buffers are the same before calling this. +void CheckBufferDataIsEqual(DataType dtype, int64 num_elements, void* a, + void* b) { + switch (dtype) { +#define CASE(type) \ + case DataTypeToEnum::value: { \ + type* typed_a = static_cast(a); \ + type* typed_b = static_cast(b); \ + for (int64 i = 0; i < num_elements; ++i) { \ + if (DataTypeIsFloating(dtype)) { \ + EXPECT_FLOAT_EQ(typed_a[i], typed_b[i]); \ + } else { \ + EXPECT_EQ(typed_a[i], typed_b[i]); \ + } \ + } \ + break; \ + } + TF_CALL_INTEGRAL_TYPES(CASE); + TF_CALL_double(CASE); + TF_CALL_float(CASE); +#undef CASE + default: + CHECK(false) << "Unsupported data type: " << DataTypeString(dtype); + } +} + +class ConstantTest : public ::testing::TestWithParam< + std::tuple, bool>> { + public: + ConstantTest() + : device_mgr_(std::make_unique(DeviceFactory::NewDevice( + "CPU", {}, "/job:localhost/replica:0/task:0"))), + ctx_(new EagerContext( + SessionOptions(), + tensorflow::ContextDevicePlacementPolicy::DEVICE_PLACEMENT_SILENT, + tensorflow::ContextMirroringPolicy::MIRRORING_NONE, + /* async= */ false, + /* lazy_copy_function_remote_inputs= */ false, device_mgr_.get(), + /* device_mgr_owned= */ false, /* rendezvous= */ nullptr, + /* custom_kernel_creator= */ nullptr, + /* cluster_flr= */ nullptr)) {} + + EagerContext* context() { return ctx_.get(); } + + private: + std::unique_ptr device_mgr_; + EagerContextPtr ctx_; +}; + +// Basic sanity check that roundtripping a Tensor->Tensorproto->Constant +// preserves values. +TEST_P(ConstantTest, CreateConstantSuccessful) { + // Get test parameters + auto& test_params = GetParam(); + DataType dtype = std::get<0>(test_params); + TensorShape shape(std::get<1>(test_params)); + bool tensorproto_use_tensor_content = std::get<2>(test_params); + + // Construct a Tensor with the given dtype + shape + Tensor expected(dtype, shape); + FillNumericTensor(&expected, 42); + + // Serialize it to a Tensorproto + TensorProto proto; + if (tensorproto_use_tensor_content) { + expected.AsProtoTensorContent(&proto); + } else { + expected.AsProtoField(&proto); + } + + // Revival should succeed w/o errors + std::unique_ptr revived; + TF_EXPECT_OK(internal::TensorProtoToConstant(context(), proto, &revived)); + + // The revived tensorhandle should have the exact same dtype, shape, + + // approx equivalent data to the original. 
+ ImmediateExecutionTensorHandle* handle = revived->handle(); + Status status; + AbstractTensorPtr revived_tensor(handle->Resolve(&status)); + TF_EXPECT_OK(status) << "Failed to convert tensorhandle to tensor"; + EXPECT_EQ(revived_tensor->Type(), expected.dtype()); + EXPECT_EQ(revived_tensor->NumElements(), expected.NumElements()); + EXPECT_EQ(revived_tensor->NumDims(), expected.dims()); + for (int i = 0; i < expected.dims(); ++i) { + EXPECT_EQ(revived_tensor->Dim(i), expected.dim_size(i)); + } + + CheckBufferDataIsEqual(expected.dtype(), expected.NumElements(), + revived_tensor->Data(), expected.data()); +} + +// Test against combinations of tensors that are +// 1. Varying dtypes +// 2. Varying shapes +// 3. TensorProto serialized using tensor_content vs repeated type +INSTANTIATE_TEST_SUITE_P( + ConstantIntegerDtypesTest, ConstantTest, + ::testing::Combine( + ::testing::ValuesIn(DataTypeSetToVector(kDataTypeIsInteger)), + ::testing::ValuesIn(InterestingShapes()), + ::testing::Values(false, true))); + +INSTANTIATE_TEST_SUITE_P( + ConstantFloatingDtypesTest, ConstantTest, + ::testing::Combine(::testing::Values(DT_FLOAT, DT_DOUBLE), + ::testing::ValuesIn(InterestingShapes()), + ::testing::Values(false, true))); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD index eed796b4ec1..0c959e327a8 100644 --- a/tensorflow/compiler/aot/BUILD +++ b/tensorflow/compiler/aot/BUILD @@ -69,6 +69,7 @@ cc_library( "//tensorflow/core:protos_all_cc", "@llvm-project//llvm:ARMCodeGen", # fixdeps: keep "@llvm-project//llvm:PowerPCCodeGen", # fixdeps: keep + "@llvm-project//llvm:Support", "@llvm-project//llvm:Target", "@llvm-project//llvm:X86CodeGen", # fixdeps: keep "//tensorflow/core:regexp_internal", diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc index a2cba5cdf9e..fe0d6d5a074 100644 --- a/tensorflow/compiler/aot/compile.cc +++ b/tensorflow/compiler/aot/compile.cc @@ -22,6 +22,7 @@ limitations under the License. 
#include "absl/base/call_once.h" #include "llvm-c/Target.h" +#include "llvm/Support/ManagedStatic.h" #include "tensorflow/compiler/aot/codegen.h" #include "tensorflow/compiler/aot/flags.h" #include "tensorflow/compiler/aot/quantize.h" diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc index fc0ff8d9445..eb31b23c991 100644 --- a/tensorflow/compiler/jit/xla_launch_util.cc +++ b/tensorflow/compiler/jit/xla_launch_util.cc @@ -476,10 +476,36 @@ Status XlaComputationLaunchContext::PopulateOutputs( stream->ThenRecordEvent(definition_event.get()); } + std::vector output_tensor_shapes; + output_tensor_shapes.reserve(ctx->num_outputs()); + if (output.on_host_shape().is_dynamic()) { + TF_ASSIGN_OR_RETURN( + auto transfer_manager, + xla::TransferManager::GetForPlatform(stream->parent()->platform())); + + xla::Shape output_host_shape = output.on_host_shape(); + xla::Shape output_device_shape = output.on_device_shape(); + TF_RETURN_IF_ERROR(transfer_manager->ReadDynamicShapes( + stream, &output, &output_host_shape, &output_device_shape)); + + output.set_shapes(output_host_shape, output_device_shape); + for (int i = 0; i < ctx->num_outputs(); ++i) { + const xla::Shape& subshape = + xla::ShapeUtil::GetSubshape(output_host_shape, {i}); + TensorShape shape; + TF_RETURN_IF_ERROR(XLAShapeToTensorShape(subshape, &shape)); + output_tensor_shapes.push_back(shape); + } + } else { + for (int i = 0; i < ctx->num_outputs(); ++i) { + output_tensor_shapes.push_back(compilation_result->outputs[i].shape); + } + } + // Copy XLA results to the OpOutputList. int output_num = 0; for (int i = 0; i < ctx->num_outputs(); ++i) { - const TensorShape& shape = compilation_result->outputs[i].shape; + const TensorShape& shape = output_tensor_shapes[i]; const DataType& type = compilation_result->outputs[i].type; VLOG(2) << "Retval " << i << " shape " << shape.DebugString() << " type " << DataTypeString(type); diff --git a/tensorflow/compiler/mlir/g3doc/xla_gpu_codegen.md b/tensorflow/compiler/mlir/g3doc/xla_gpu_codegen.md index 06c55abf1fa..2fe109c1783 100644 --- a/tensorflow/compiler/mlir/g3doc/xla_gpu_codegen.md +++ b/tensorflow/compiler/mlir/g3doc/xla_gpu_codegen.md @@ -24,10 +24,10 @@ the codegen input. ## Tasks - | Host | Device -------------- | ------------------------ | ------------------------ -Input format | HloInstruction* (Task 1) | HloInstruction* (Task 1) -Output format | xla::Thunk (Task 2) | LLVM IR (Task 3) +| | Host | Device +| ------------- | ------------------------ | ------------------------ +| Input format | HloInstruction* (Task 1) | HloInstruction* (Task 1) +| Output format | xla::Thunk (Task 2) | LLVM IR (Task 3) * **Task 1** changes both host and device input format from HloInstruction* to LHLO. 
diff --git a/tensorflow/compiler/mlir/lite/tests/end2end/BUILD b/tensorflow/compiler/mlir/lite/tests/end2end/BUILD index cf584987d2d..25bd761f99e 100644 --- a/tensorflow/compiler/mlir/lite/tests/end2end/BUILD +++ b/tensorflow/compiler/mlir/lite/tests/end2end/BUILD @@ -26,6 +26,7 @@ filegroup( "//tensorflow/compiler/mlir/lite:flatbuffer_to_string", "//tensorflow/compiler/mlir/lite:tf_tfl_translate", "@llvm-project//llvm:FileCheck", + "@llvm-project//llvm:not", ], ) diff --git a/tensorflow/compiler/mlir/lite/tests/end2end/control_flow_v1.pbtxt b/tensorflow/compiler/mlir/lite/tests/end2end/control_flow_v1.pbtxt new file mode 100644 index 00000000000..7b3a4d14fea --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/end2end/control_flow_v1.pbtxt @@ -0,0 +1,257 @@ +# RUN: not tf_tfl_translate -tf-upgrade-legacy=false -tf-input-arrays=Placeholder,Placeholder_1 -tf-input-shapes=1,2:1 -tf-output-arrays=cond/Merge -tf-enable-shape-inference-on-import=false -mlir-print-debuginfo -output-mlir %s -o - 2>&1 | FileCheck %s + +# CHECK: error: The graph has Control Flow V1 ops. TFLite converter doesn't support Control Flow V1 ops. Consider using Control Flow V2 ops instead. + +node { + name: "Const" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + dim { + size: 2 + } + dim { + size: 2 + } + } + tensor_content: "\315\314\314=\315\314L>\232\231\231>\315\314\314>" + } + } + } +} +node { + name: "Placeholder" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "shape" + value { + shape { + dim { + size: -1 + } + dim { + size: 2 + } + } + } + } +} +node { + name: "Placeholder_1" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_BOOL + } + } + attr { + key: "shape" + value { + shape { + } + } + } +} +node { + name: "cond/Switch" + op: "Switch" + input: "Placeholder_1" + input: "Placeholder_1" + attr { + key: "T" + value { + type: DT_BOOL + } + } +} +node { + name: "cond/switch_t" + op: "Identity" + input: "cond/Switch:1" + attr { + key: "T" + value { + type: DT_BOOL + } + } +} +node { + name: "cond/switch_f" + op: "Identity" + input: "cond/Switch" + attr { + key: "T" + value { + type: DT_BOOL + } + } +} +node { + name: "cond/pred_id" + op: "Identity" + input: "Placeholder_1" + attr { + key: "T" + value { + type: DT_BOOL + } + } +} +node { + name: "cond/MatMul" + op: "MatMul" + input: "cond/MatMul/Switch:1" + input: "cond/MatMul/Switch_1:1" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "transpose_a" + value { + b: false + } + } + attr { + key: "transpose_b" + value { + b: false + } + } +} +node { + name: "cond/MatMul/Switch" + op: "Switch" + input: "Placeholder" + input: "cond/pred_id" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_class" + value { + list { + s: "loc:@Placeholder" + } + } + } +} +node { + name: "cond/MatMul/Switch_1" + op: "Switch" + input: "Const" + input: "cond/pred_id" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_class" + value { + list { + s: "loc:@Const" + } + } + } +} +node { + name: "cond/Add" + op: "Add" + input: "cond/Add/Switch" + input: "cond/Add/Switch_1" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "cond/Add/Switch" + op: "Switch" + input: "Placeholder" + input: "cond/pred_id" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_class" + value { + list { + s: "loc:@Placeholder" + } + } + } +} 
+node { + name: "cond/Add/Switch_1" + op: "Switch" + input: "Const" + input: "cond/pred_id" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_class" + value { + list { + s: "loc:@Const" + } + } + } +} +node { + name: "cond/Merge" + op: "Merge" + input: "cond/Add" + input: "cond/MatMul" + attr { + key: "N" + value { + i: 2 + } + } + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "init" + op: "NoOp" +} +versions { + producer: 134 +} diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc index 31dad60c294..fcaebe82f74 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc @@ -172,7 +172,7 @@ int main(int argc, char **argv) { input_file_name, input_mlir, use_splatted_constant, custom_opdefs, debug_info_file, input_arrays, input_dtypes, input_shapes, output_arrays, - /*prune_unused_nodes=*/true, &source_mgr, &context); + /*prune_unused_nodes=*/true, upgrade_legacy, &source_mgr, &context); } // If errors occur, the library call in the above already logged the error diff --git a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc index 38b96cf833f..2e45953c5fa 100644 --- a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc @@ -21,6 +21,7 @@ limitations under the License. #include "llvm/Support/raw_ostream.h" #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Parser.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/FileUtilities.h" // from @llvm-project @@ -28,6 +29,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/flatbuffer_export.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_config.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/decode_constant.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h" @@ -39,19 +41,47 @@ limitations under the License. #include "tensorflow/stream_executor/lib/statusor.h" namespace tensorflow { - +namespace { using mlir::MLIRContext; using mlir::ModuleOp; +using mlir::Operation; using mlir::OwningModuleRef; using stream_executor::port::StatusOr; +bool IsControlFlowV1Op(Operation* op) { + return mlir::isa(op) || + mlir::isa(op) || + mlir::isa(op) || + mlir::isa(op) || + mlir::isa(op) || + mlir::isa(op); +} + +mlir::LogicalResult IsValidGraph(mlir::ModuleOp module) { + auto result = module.walk([&](Operation* op) { + return IsControlFlowV1Op(op) ? mlir::WalkResult::interrupt() + : mlir::WalkResult::advance(); + }); + if (result.wasInterrupted()) { + module.emitError( + "The graph has Control Flow V1 ops. TFLite converter doesn't support " + "Control Flow V1 ops. Consider using Control Flow V2 ops instead. 
See " + "https://www.tensorflow.org/api_docs/python/tf/compat/v1/" + "enable_control_flow_v2."); + return mlir::failure(); + } + return mlir::success(); +} +} // namespace + StatusOr LoadFromGraphdefOrMlirSource( const std::string& input_filename, bool input_mlir, bool use_splatted_constant, const std::vector& extra_tf_opdefs, absl::string_view debug_info_file, absl::string_view input_arrays, absl::string_view input_dtypes, absl::string_view input_shapes, absl::string_view output_arrays, bool prune_unused_nodes, - llvm::SourceMgr* source_mgr, MLIRContext* context) { + bool enable_upgrade_legacy, llvm::SourceMgr* source_mgr, + MLIRContext* context) { // Set up the input file. std::string error_message; auto file = mlir::openInputFile(input_filename, &error_message); @@ -86,14 +116,14 @@ StatusOr LoadFromGraphdefOrMlirSource( file->getBuffer(), debug_info_file, input_arrays, input_dtypes, input_shapes, output_arrays, /*control_output_arrays=*/"", prune_unused_nodes, /*convert_legacy_fed_inputs=*/true, - /*graph_as_function=*/false, /*upgrade_legacy=*/true, + /*graph_as_function=*/false, enable_upgrade_legacy, /*enable_shape_inference=*/false, context); } return tensorflow::GraphdefToMlirTranslateFunction( file->getBuffer(), debug_info_file, input_arrays, input_dtypes, input_shapes, output_arrays, /*control_output_arrays=*/"", prune_unused_nodes, /*convert_legacy_fed_inputs=*/true, - /*graph_as_function=*/false, /*upgrade_legacy=*/true, + /*graph_as_function=*/false, enable_upgrade_legacy, /*enable_shape_inference=*/false, context); } @@ -104,7 +134,8 @@ Status ConvertTFExecutorToTFLOrFlatbuffer( mlir::PassManager* pass_manager) { mlir::StatusScopedDiagnosticHandler statusHandler(module.getContext(), /*propagate=*/true); - if (failed(pass_manager->run(module))) { + + if (failed(IsValidGraph(module)) || failed(pass_manager->run(module))) { return statusHandler.ConsumeStatus(); } diff --git a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h index d2c31a6b972..82cf9c9549b 100644 --- a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h +++ b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h @@ -41,7 +41,8 @@ LoadFromGraphdefOrMlirSource( absl::string_view debug_info_file, absl::string_view input_arrays, absl::string_view input_dtypes, absl::string_view input_shapes, absl::string_view output_arrays, bool prune_unused_nodes, - llvm::SourceMgr* source_mgr, mlir::MLIRContext* context); + bool enable_upgrade_legacy, llvm::SourceMgr* source_mgr, + mlir::MLIRContext* context); // Load Saved model (either v1 or v2) into MLIR. 
stream_executor::port::StatusOr ImportSavedModel( diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index b159815d5eb..db31d4faf5f 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -1356,6 +1356,7 @@ cc_library( srcs = ["utils/tpu_rewrite_device_util.cc"], hdrs = ["utils/tpu_rewrite_device_util.h"], deps = [ + ":tensorflow", "//tensorflow/compiler/xla:array4d", "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla/service:computation_placer", @@ -1366,6 +1367,7 @@ cc_library( "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", ], ) @@ -1374,6 +1376,7 @@ tf_cc_test( size = "small", srcs = ["utils/tpu_rewrite_device_util_test.cc"], deps = [ + ":device_util", ":tpu_rewrite_device_util", "//tensorflow/core:framework", "//tensorflow/core:test", diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/build_defs.bzl b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/build_defs.bzl index 594afa10453..95ad05aa1e6 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/build_defs.bzl +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/build_defs.bzl @@ -4,8 +4,6 @@ load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "lit_test") def tf_saved_model_test(name, data, tags = None): """Create a SavedModel test.""" - if tags == None: - tags = ["no_rocm"] native.py_binary( name = name, testonly = 1, @@ -26,5 +24,5 @@ def tf_saved_model_test(name, data, tags = None): name = name + ".py", data = [name] + data, driver = "@llvm-project//mlir:run_lit.sh", - tags = tags, + tags = tags + ["no_rocm"], ) diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc index bdfe43fc9cb..2be6ee7a78c 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc @@ -113,64 +113,6 @@ tf_device::LaunchOp CreateLaunchForBlock(OpBuilder* builder, Operation* op, return launch; } -// Parses TPU compilation and execution devices from a TPU cluster and returns -// the host device for the head and tail computations. If the TPU computation is -// replicated, kTPUReplicatedHost is returned instead. 
-LogicalResult GetHostDeviceForHeadTailComputation( - mlir::TF::RuntimeDevices devices, tf_device::ClusterOp cluster, - std::string* host_device) { - auto replicate = cluster.getParentOfType(); - if (replicate) { - *host_device = tensorflow::kTPUReplicatedHost; - return success(); - } - - auto num_cores_per_replica_attr = - cluster.getAttrOfType(tensorflow::kNumCoresPerReplicaAttr); - if (!num_cores_per_replica_attr) - return cluster.emitOpError( - "cluster op missing `num_cores_per_replica` attribute"); - - if (num_cores_per_replica_attr.getInt() != 1) - return cluster.emitOpError( - "outside compilation is not supported with model parallelism."); - - auto topology_attr = - cluster.getAttrOfType(tensorflow::kTopologyAttr); - if (!topology_attr) - return cluster.emitOpError("cluster op missing `topology` attribute"); - - auto device_assignment_attr = - cluster.getAttrOfType(tensorflow::kDeviceAssignmentAttr); - if (!device_assignment_attr) - return cluster.emitOpError(llvm::formatv("requires attribute '{0}'", - tensorflow::kDeviceAssignmentAttr) - .str()); - - auto status_or_device_coodinates = - tensorflow::GetDeviceCoordinates(device_assignment_attr); - - if (!status_or_device_coodinates.ok()) - return cluster.emitError() - << "error in fetching tpu device coordinates: " - << status_or_device_coodinates.status().error_message(); - - // Determine compilation and execution devices. - auto status_or_tpu_device_assignment = - tensorflow::GetTPUCompilationAndExecutionDevices( - devices.device_names(), /*num_replicas=*/1, - /*num_cores_per_replica=*/1, topology_attr.getValue(), - status_or_device_coodinates.ConsumeValueOrDie()); - if (!status_or_tpu_device_assignment.ok()) - return cluster.emitError() - << "error in fetching TPU compilation/execution devices: " - << status_or_tpu_device_assignment.status().error_message(); - auto& tpu_device_assignment = status_or_tpu_device_assignment.ValueOrDie(); - - *host_device = tpu_device_assignment.tpu_devices[0][0].host; - return success(); -} - // Returns a set of ops that are outside compiled and can be extracted to before // the TPU computation. These ops are either connected to the inputs of the TPU // computation or other ops that can be extracted, and have no operands from @@ -232,8 +174,8 @@ mlir::LogicalResult LiftHeadOutsideCompiledOps( llvm::SmallVector head_outside_compiled_ops = FindOutsideCompiledOpsAtHead(cluster); if (head_outside_compiled_ops.empty()) return success(); - if (failed( - GetHostDeviceForHeadTailComputation(devices, cluster, host_device))) + if (failed(tensorflow::GetHostDeviceOutsideComputation(devices, cluster, + host_device))) return failure(); CreateHeadComputation(builder, cluster, head_outside_compiled_ops, @@ -361,8 +303,8 @@ mlir::LogicalResult LiftTailOutsideCompiledOps( if (tail_outside_compiled_ops.empty()) return success(); if (host_device.empty()) - if (failed(GetHostDeviceForHeadTailComputation(devices, *cluster, - &host_device))) + if (failed(tensorflow::GetHostDeviceOutsideComputation(devices, *cluster, + &host_device))) return failure(); // Forward all results of cluster first. 
These results will be remapped once diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc index 282b7ad3139..f884b75bce1 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc @@ -484,4 +484,59 @@ std::string GetDeviceAliasForLogicalCore(int core_index) { return llvm::formatv("{0}_{1}", kTPUReplicatedCore, core_index).str(); } +mlir::LogicalResult GetHostDeviceOutsideComputation( + mlir::TF::RuntimeDevices devices, mlir::tf_device::ClusterOp cluster, + std::string* host_device) { + auto replicate = cluster.getParentOfType(); + if (replicate) { + *host_device = tensorflow::kTPUReplicatedHost; + return mlir::success(); + } + + auto num_cores_per_replica_attr = cluster.getAttrOfType( + tensorflow::kNumCoresPerReplicaAttr); + if (!num_cores_per_replica_attr) + return cluster.emitOpError( + "cluster op missing `num_cores_per_replica` attribute"); + + if (num_cores_per_replica_attr.getInt() != 1) + return cluster.emitOpError( + "outside compilation is not supported with model parallelism."); + + auto topology_attr = + cluster.getAttrOfType(tensorflow::kTopologyAttr); + if (!topology_attr) + return cluster.emitOpError("cluster op missing `topology` attribute"); + + auto device_assignment_attr = + cluster.getAttrOfType(tensorflow::kDeviceAssignmentAttr); + if (!device_assignment_attr) + return cluster.emitOpError(llvm::formatv("requires attribute '{0}'", + tensorflow::kDeviceAssignmentAttr) + .str()); + + auto status_or_device_coodinates = + tensorflow::GetDeviceCoordinates(device_assignment_attr); + + if (!status_or_device_coodinates.ok()) + return cluster.emitError() + << "error in fetching tpu device coordinates: " + << status_or_device_coodinates.status().error_message(); + + // Determine compilation and execution devices. + auto status_or_tpu_device_assignment = + tensorflow::GetTPUCompilationAndExecutionDevices( + devices.device_names(), /*num_replicas=*/1, + /*num_cores_per_replica=*/1, topology_attr.getValue(), + status_or_device_coodinates.ConsumeValueOrDie()); + if (!status_or_tpu_device_assignment.ok()) + return cluster.emitError() + << "error in fetching TPU compilation/execution devices: " + << status_or_tpu_device_assignment.status().error_message(); + auto& tpu_device_assignment = status_or_tpu_device_assignment.ValueOrDie(); + + *host_device = tpu_device_assignment.tpu_devices[0][0].host; + return mlir::success(); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h index 6bb541ab683..96cc8d7877b 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h @@ -23,6 +23,9 @@ limitations under the License. #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/util/device_name_utils.h" @@ -237,6 +240,13 @@ StatusOr GetTPUCompilationAndExecutionDevices( // logical core. 
std::string GetDeviceAliasForLogicalCore(int core_index); +// Parses TPU compilation and execution devices from a TPU cluster and returns +// the host device for the head and tail computations. If the TPU computation is +// replicated, kTPUReplicatedHost is returned instead. +mlir::LogicalResult GetHostDeviceOutsideComputation( + mlir::TF::RuntimeDevices devices, mlir::tf_device::ClusterOp cluster, + std::string* host_device); + } // namespace tensorflow #endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_TPU_REWRITE_DEVICE_UTIL_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc index a70e93a0195..49a8f704b30 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc @@ -21,6 +21,8 @@ limitations under the License. #include "llvm/Support/FormatVariadic.h" #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/utils/device_util.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/protobuf/tpu/topology.pb.h" @@ -622,5 +624,185 @@ TEST(TPURewriteDeviceUtilTest, TestInvalidAttrForDeviceAssignmentDisallowed) { "bad 'device_assignment' attribute at index 0, not an int"); } +TEST(TPURewriteDeviceUtilTest, TestGetHostFailDeviceMissingAttributes) { + mlir::registerDialect(); + mlir::MLIRContext context; + mlir::OwningModuleRef module_ref = + mlir::ModuleOp::create(mlir::UnknownLoc::get(&context)); + mlir::OpBuilder builder(module_ref->getBodyRegion()); + llvm::SmallVector result_types; + auto cluster = builder.create( + mlir::UnknownLoc::get(&context), result_types); + + mlir::TF::RuntimeDevices devices; + std::string host_device; + EXPECT_TRUE(mlir::failed( + GetHostDeviceOutsideComputation(devices, cluster, &host_device))); +} + +TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceFailModelParallelism) { + mlir::registerDialect(); + mlir::MLIRContext context; + mlir::OwningModuleRef module_ref = + mlir::ModuleOp::create(mlir::UnknownLoc::get(&context)); + mlir::OpBuilder builder(module_ref->getBodyRegion()); + + llvm::SmallVector result_types; + auto cluster = builder.create( + mlir::UnknownLoc::get(&context), result_types); + cluster.setAttr(kNumCoresPerReplicaAttr, + builder.getIntegerAttr(builder.getIntegerType(64), 5)); + cluster.setAttr(kTopologyAttr, builder.getStringAttr("")); + cluster.setAttr(kDeviceAssignmentAttr, builder.getArrayAttr({})); + + mlir::TF::RuntimeDevices runtime_devices; + std::string host_device; + EXPECT_TRUE(mlir::failed( + GetHostDeviceOutsideComputation(runtime_devices, cluster, &host_device))); +} + +TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceFailMissingTopology) { + mlir::registerDialect(); + mlir::MLIRContext context; + mlir::OwningModuleRef module_ref = + mlir::ModuleOp::create(mlir::UnknownLoc::get(&context)); + mlir::OpBuilder builder(module_ref->getBodyRegion()); + + llvm::SmallVector result_types; + auto cluster = builder.create( + mlir::UnknownLoc::get(&context), result_types); + cluster.setAttr(kNumCoresPerReplicaAttr, + builder.getIntegerAttr(builder.getIntegerType(64), 1)); + cluster.setAttr(kDeviceAssignmentAttr, builder.getArrayAttr({})); + + mlir::TF::RuntimeDevices runtime_devices; + std::string host_device; + 
EXPECT_TRUE(mlir::failed( + GetHostDeviceOutsideComputation(runtime_devices, cluster, &host_device))); +} + +TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceFailMissingDeviceAssignment) { + mlir::registerDialect(); + mlir::MLIRContext context; + mlir::OwningModuleRef module_ref = + mlir::ModuleOp::create(mlir::UnknownLoc::get(&context)); + mlir::OpBuilder builder(module_ref->getBodyRegion()); + + llvm::SmallVector result_types; + auto cluster = builder.create( + mlir::UnknownLoc::get(&context), result_types); + cluster.setAttr(kNumCoresPerReplicaAttr, + builder.getIntegerAttr(builder.getIntegerType(64), 1)); + cluster.setAttr(kTopologyAttr, builder.getStringAttr("")); + + mlir::TF::RuntimeDevices runtime_devices; + std::string host_device; + EXPECT_TRUE(mlir::failed( + GetHostDeviceOutsideComputation(runtime_devices, cluster, &host_device))); +} + +TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceFailBadDeviceAssignment) { + mlir::registerDialect(); + mlir::MLIRContext context; + mlir::OwningModuleRef module_ref = + mlir::ModuleOp::create(mlir::UnknownLoc::get(&context)); + mlir::OpBuilder builder(module_ref->getBodyRegion()); + + llvm::SmallVector result_types; + auto cluster = builder.create( + mlir::UnknownLoc::get(&context), result_types); + cluster.setAttr(kNumCoresPerReplicaAttr, + builder.getIntegerAttr(builder.getIntegerType(64), 1)); + cluster.setAttr(kTopologyAttr, builder.getStringAttr("")); + cluster.setAttr(kDeviceAssignmentAttr, + builder.getStrArrayAttr(llvm::ArrayRef( + {"bad_device_assigment"}))); + + mlir::TF::RuntimeDevices runtime_devices; + std::string host_device; + EXPECT_TRUE(mlir::failed( + GetHostDeviceOutsideComputation(runtime_devices, cluster, &host_device))); +} + +TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceFailBadDeviceName) { + mlir::registerDialect(); + mlir::MLIRContext context; + mlir::OwningModuleRef module_ref = + mlir::ModuleOp::create(mlir::UnknownLoc::get(&context)); + mlir::OpBuilder builder(module_ref->getBodyRegion()); + module_ref->setAttr( + "tf.devices", builder.getStrArrayAttr( + llvm::ArrayRef({"bad_device_name"}))); + + llvm::SmallVector result_types; + auto cluster = builder.create( + mlir::UnknownLoc::get(&context), result_types); + cluster.setAttr(kNumCoresPerReplicaAttr, + builder.getIntegerAttr(builder.getIntegerType(64), 1)); + cluster.setAttr(kTopologyAttr, builder.getStringAttr("")); + cluster.setAttr(kDeviceAssignmentAttr, builder.getArrayAttr({})); + + mlir::TF::RuntimeDevices runtime_devices; + GetDevicesFromOp(*module_ref, &runtime_devices); + std::string host_device; + EXPECT_TRUE(mlir::failed( + GetHostDeviceOutsideComputation(runtime_devices, cluster, &host_device))); +} + +TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceTPUReplicate) { + mlir::registerDialect(); + mlir::MLIRContext context; + mlir::OwningModuleRef module_ref = + mlir::ModuleOp::create(mlir::UnknownLoc::get(&context)); + mlir::OpBuilder builder(module_ref->getBodyRegion()); + + llvm::SmallDenseMap> + devices; + auto replicate = builder.create( + mlir::UnknownLoc::get(&context), /*num_replicas=*/2, devices, + llvm::ArrayRef, mlir::Type>>{}, + llvm::ArrayRef{}); + builder.setInsertionPoint(&replicate.body().front(), + replicate.body().front().begin()); + + llvm::SmallVector result_types; + auto cluster = builder.create( + mlir::UnknownLoc::get(&context), result_types); + + mlir::TF::RuntimeDevices runtime_devices; + std::string host_device; + EXPECT_TRUE(mlir::succeeded( + GetHostDeviceOutsideComputation(runtime_devices, cluster, &host_device))); + 
EXPECT_EQ(host_device, kTPUReplicatedHost); +} + +TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceNotReplicated) { + mlir::registerDialect(); + mlir::MLIRContext context; + mlir::OwningModuleRef module_ref = + mlir::ModuleOp::create(mlir::UnknownLoc::get(&context)); + mlir::OpBuilder builder(module_ref->getBodyRegion()); + module_ref->setAttr( + "tf.devices", builder.getStrArrayAttr(llvm::ArrayRef( + {"/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0", + "/job:localhost/replica:0/task:0/device:TPU:0", + "/job:worker/replica:0/task:0/device:CPU:0"}))); + + llvm::SmallVector result_types; + auto cluster = builder.create( + mlir::UnknownLoc::get(&context), result_types); + cluster.setAttr(kNumCoresPerReplicaAttr, + builder.getIntegerAttr(builder.getIntegerType(64), 1)); + cluster.setAttr(kTopologyAttr, builder.getStringAttr("")); + cluster.setAttr(kDeviceAssignmentAttr, builder.getArrayAttr({})); + + mlir::TF::RuntimeDevices runtime_devices; + GetDevicesFromOp(*module_ref, &runtime_devices); + std::string host_device; + EXPECT_TRUE(mlir::succeeded( + GetHostDeviceOutsideComputation(runtime_devices, cluster, &host_device))); + EXPECT_EQ(host_device, "/job:localhost/replica:0/task:0/device:CPU:0"); +} + } // anonymous namespace } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc index 78a77dc3b4d..99d2c08aa98 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc @@ -129,7 +129,7 @@ struct DynamicMemRefCastOpConverter void PopulateLhloToLLVMConversionPatterns(LLVMTypeConverter *converter, OwningRewritePatternList *patterns) { patterns->insert( - *converter, LowerToLLVMOptions()); + *converter); } } // namespace xla_lhlo diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py index 789309bb3bc..07a41d67520 100644 --- a/tensorflow/compiler/tests/binary_ops_test.py +++ b/tensorflow/compiler/tests/binary_ops_test.py @@ -229,16 +229,16 @@ class BinaryOpsTest(xla_test.XLATestCase): self._testBinary( gen_math_ops.xdivy, np.array([0, 4, 3, 2, 1, 0], dtype=dtype), - np.array([0, 5, 6, 7, 8, float("NaN")], dtype=dtype), - expected=np.array([0, 0.8, 0.5, 0.285714, 0.125, 0], dtype=dtype), + np.array([[0, 5, 6, 7, 8, float("NaN")]], dtype=dtype), + expected=np.array([[0, 0.8, 0.5, 0.285714, 0.125, 0]], dtype=dtype), rtol=1e-6, atol=1e-6) self._testBinary( gen_math_ops.xlogy, np.array([0, 4, 3, 2, 1, 0], dtype=dtype), - np.array([0, 5, 6, 7, 8, float("NaN")], dtype=dtype), - expected=np.array([0, 6.437752, 5.375278, 3.89182, 2.079442, 0], + np.array([[0, 5, 6, 7, 8, float("NaN")]], dtype=dtype), + expected=np.array([[0, 6.437752, 5.375278, 3.89182, 2.079442, 0]], dtype=dtype), rtol=1e-4, atol=1e-6) @@ -246,8 +246,8 @@ class BinaryOpsTest(xla_test.XLATestCase): self._testBinary( gen_math_ops.xlog1py, np.array([0, 4, 3, 2, 1, 0], dtype=dtype), - np.array([-1, 5, 6, 7, 8, float("NaN")], dtype=dtype), - expected=np.array([0, 7.167038, 5.837730, 4.158883, 2.197225, 0], + np.array([[-1, 5, 6, 7, 8, float("NaN")]], dtype=dtype), + expected=np.array([[0, 7.167038, 5.837730, 4.158883, 2.197225, 0]], dtype=dtype), rtol=1e-4, atol=1e-6) diff --git a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc index 0ea851e9325..88d7525e5d5 100644 --- a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc +++ 
b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc @@ -153,6 +153,7 @@ XLA_MAKE_BINARY(Xlogy, XlogyImpl(lhs, rhs, broadcast_helper)); xla::XlaOp Xlog1pyImpl(xla::XlaOp x, xla::XlaOp y, const BCast& broadcast_helper) { + std::tie(x, y) = XlaBinaryOp::Broadcast(x, y, broadcast_helper); auto non_zero = xla::Mul(x, xla::Log1p(y)); auto zero = xla::ZerosLike(non_zero); auto x_is_zero = xla::Eq(x, zero); diff --git a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc index 405c5e787da..66545fc72cf 100644 --- a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/lib/constants.h" #include "tensorflow/compiler/xla/client/lib/math.h" #include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/core/framework/kernel_def_builder.h" namespace tensorflow { @@ -85,8 +86,20 @@ XLAJIT_MAKE_UNARY(Rsqrt, xla::Rsqrt(x)); XLAJIT_MAKE_UNARY(Sigmoid, xla::Logistic(x)); // Returns 0 if x is NaN, 0 if x is 0, -1 if x < 0 and 1 if x > 0. -XLAJIT_MAKE_UNARY(Sign, - xla::Select(xla::Ne(x, x), xla::ZerosLike(x), xla::Sign(x))); +static xla::XlaOp Sign(xla::XlaBuilder* b, xla::XlaOp x) { + return b->ReportErrorOrReturn([&]() -> xla::StatusOr { + TF_ASSIGN_OR_RETURN(auto shape, b->GetShape(x)); + if (xla::primitive_util::IsComplexType(shape.element_type())) { + return xla::Sign(x); + } + auto gt = xla::Gt(x, xla::ZerosLike(x)); + auto lt = xla::Lt(x, xla::ZerosLike(x)); + return xla::ConvertElementType(gt, shape.element_type()) - + xla::ConvertElementType(lt, shape.element_type()); + }); +} + +XLAJIT_MAKE_UNARY(Sign, Sign(b, x)); XLAJIT_MAKE_UNARY(Sinh, xla::Sinh(x)); static xla::XlaOp Softplus(xla::XlaBuilder* b, xla::XlaOp features) { diff --git a/tensorflow/compiler/xla/g3doc/tiled_layout.md b/tensorflow/compiler/xla/g3doc/tiled_layout.md index 21e88ceab62..b40f0a95a3d 100644 --- a/tensorflow/compiler/xla/g3doc/tiled_layout.md +++ b/tensorflow/compiler/xla/g3doc/tiled_layout.md @@ -3,9 +3,10 @@ Caution: Tiled layout is *pre-release* and this describes how it's intended to work. Errors may be silently ignored. -
![](images/xla_array_layout_figure1.png)
-
-Figure 1
+<center>
+<img src="images/xla_array_layout_figure1.png">
+ Figure 1
+</center>
Figure 1 shows how an array F32[3,5] is laid out in memory with 2x2 tiling. A shape with this layout is written as F32[3,5]{1,0:(2,2)}, where 1,0 relates to @@ -120,9 +121,10 @@ element follows the formula above as expected. XLA's tiling becomes even more flexible by applying it repeatedly. -
![](images/xla_array_layout_figure2.png)
-
-Figure 2
+<center>
+<img src="images/xla_array_layout_figure2.png">
+ Figure 2
+</center>
Figure 2 shows how an array of size 4x8 is tiled by two levels of tiling (first 2x4 then 2x1). We represent this repeated tiling as (2,4)(2,1). Each color diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 2fd457e8e47..10e2d7e65d1 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -1202,6 +1202,9 @@ cc_library( srcs = ["transfer_manager.cc"], hdrs = ["transfer_manager.h"], deps = [ + ":compiler", + ":executable", + ":maybe_owning_device_memory", ":shaped_buffer", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", @@ -1210,8 +1213,6 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto_cc", - "//tensorflow/compiler/xla/service:executable", - "//tensorflow/compiler/xla/service:maybe_owning_device_memory", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/stream_executor:device_memory", diff --git a/tensorflow/compiler/xla/service/transfer_manager.cc b/tensorflow/compiler/xla/service/transfer_manager.cc index ebb0226476f..0fd64209152 100644 --- a/tensorflow/compiler/xla/service/transfer_manager.cc +++ b/tensorflow/compiler/xla/service/transfer_manager.cc @@ -20,6 +20,7 @@ limitations under the License. #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" +#include "tensorflow/compiler/xla/service/compiler.h" #include "tensorflow/compiler/xla/service/maybe_owning_device_memory.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -33,6 +34,7 @@ limitations under the License. using absl::StrCat; namespace xla { + /* static */ tensorflow::mutex TransferManager::platform_transfer_manager_mutex_( tensorflow::LINKER_INITIALIZED); @@ -200,6 +202,67 @@ void TransferManager::TransferArrayFromDevice( std::move(done), transfer_metadata); } +Status TransferManager::ReadDynamicShapes(se::Stream* stream, + ShapedBuffer* device_buffer, + Shape* host_shape, + Shape* device_shape) { + DCHECK(device_shape->is_dynamic()); + Shape original_device_shape = *device_shape; + Shape original_host_shape = *host_shape; + TF_RETURN_IF_ERROR(stream->BlockHostUntilDone()); + + TF_ASSIGN_OR_RETURN(auto compiler, + Compiler::GetForPlatform(stream->parent()->platform())); + TF_RETURN_IF_ERROR(device_buffer->buffers().ForEachMutableElementWithStatus( + [&](const ShapeIndex& index, se::DeviceMemoryBase* buffer) { + const Shape& buffer_shape = + ShapeUtil::GetSubshape(*device_shape, index); + if (buffer_shape.IsTuple()) { + return Status::OK(); + } + Shape& host_sub_shape = + *ShapeUtil::GetMutableSubshape(host_shape, index); + Shape& device_sub_shape = + *ShapeUtil::GetMutableSubshape(device_shape, index); + if (device_sub_shape.is_static()) { + return Status::OK(); + } + + // Read the dynamic shape metadata from the device stream. 
+ auto shape_size_fn = compiler->ShapeSizeBytesFunction(); + Shape buffer_shape_static = ShapeUtil::MakeStaticShape(buffer_shape); + const int64 offset = shape_size_fn(buffer_shape_static); + int64 metadata_size = shape_size_fn(buffer_shape) - offset; + if (metadata_size == 0) { + return InvalidArgument("Dynamic shape metadata size should not be 0"); + } + auto buffer_8 = se::DeviceMemory(*buffer); + auto metadata_buffer = + stream->parent()->GetSubBuffer(&buffer_8, offset, metadata_size); + TF_ASSIGN_OR_RETURN( + auto metadata, + TransferArrayFromDevice( + stream, + ShapeUtil::MakeShape(S32, {buffer_shape.dimensions_size()}), + metadata_buffer)); + + // Update shape size from metadata. + for (int64 i = 0; i < metadata.element_count(); ++i) { + host_sub_shape.mutable_dimensions()[i] = metadata.Get({i}); + device_sub_shape.mutable_dimensions()[i] = metadata.Get({i}); + } + return Status::OK(); + })); + host_shape->clear_dynamic_dimensions(); + device_shape->clear_dynamic_dimensions(); + + TF_RET_CHECK(ShapeUtil::DynamicShapeIsCompatible(*device_shape, + original_device_shape)); + TF_RET_CHECK( + ShapeUtil::DynamicShapeIsCompatible(*host_shape, original_host_shape)); + return Status::OK(); +} + /* static */ void TransferManager::RegisterTransferManager( se::Platform::Id platform_id, TransferManagerCreationFunction creation_function) { diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h index e3f8ceacc42..c0670d26eee 100644 --- a/tensorflow/compiler/xla/service/transfer_manager.h +++ b/tensorflow/compiler/xla/service/transfer_manager.h @@ -184,6 +184,15 @@ class TransferManager { const se::DeviceMemoryBase& source, const TransferMetadata* transfer_metadata = nullptr); + // Read from a device buffer and update the dynamic dimension sizes of + // `host_shape` and `device_shape`. The function takes in bounded dynamic + // shapes, and returns static shapes with dynamic shapes updated. + // The shape of the buffer also have to be compatible with the host shape and + // device shape. + virtual Status ReadDynamicShapes(se::Stream* stream, + ShapedBuffer* device_buffer, + Shape* host_shape, Shape* device_shape); + // Transfers the given literal into the Infeed interface of the device, // using the given executor. virtual Status TransferLiteralToInfeed(se::StreamExecutor* executor, diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto index 5c21121b98e..e8b6105d3fe 100644 --- a/tensorflow/compiler/xla/xla_data.proto +++ b/tensorflow/compiler/xla/xla_data.proto @@ -120,7 +120,7 @@ enum Format { } // Describes a tile used in tiling-based layout. Refer to -// g3doc/third_party/tensorflow/compiler/xla/g3doc/layout_with_tiling.md for +// g3doc/third_party/tensorflow/compiler/xla/g3doc/tiled_layout.md for // details about tiling-based layout. message TileProto { // Number of elements in each dimension of the tile. 
It's ordered from the diff --git a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc index 3bd8af577c8..bfd48bd1442 100644 --- a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc +++ b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc @@ -264,86 +264,28 @@ Status UpdateDynamicInputs( return Status::OK(); } -xla::StatusOr ReadMetadataLiteral( - se::Stream* stream, se::DeviceMemoryBase buffer, - const xla::Shape& buffer_shape, xla::TransferManager* transfer_manager) { - TF_ASSIGN_OR_RETURN(auto compiler, xla::Compiler::GetForPlatform( - stream->parent()->platform())); - auto shape_size_fn = compiler->ShapeSizeBytesFunction(); - xla::Shape buffer_shape_static = - xla::ShapeUtil::MakeStaticShape(buffer_shape); - const int64 offset = shape_size_fn(buffer_shape_static); - int64 metadata_size = shape_size_fn(buffer_shape) - offset; - TF_RET_CHECK(metadata_size != 0); - auto buffer_8 = se::DeviceMemory(buffer); - auto metadata_buffer = - stream->parent()->GetSubBuffer(&buffer_8, offset, metadata_size); - return transfer_manager->TransferArrayFromDevice( - stream, - xla::ShapeUtil::MakeShape(xla::S32, {buffer_shape.dimensions_size()}), - metadata_buffer); -} - -// For each subshape in the result buffer that's dynamic, read the dynamic -// dimension sizes from the metadata, and update output shapes. The result shape -// is a static and concrete shape. -xla::Status UpdateDynamicOutputs(se::Stream* stream, - const xla::ShapedBuffer& shaped_buffer, - xla::Shape* output_host_shape, - xla::Shape* output_device_shape) { - DCHECK(output_device_shape->is_dynamic()); - TF_ASSIGN_OR_RETURN( - auto transfer_manager, - xla::TransferManager::GetForPlatform(stream->parent()->platform())); - TF_RETURN_IF_ERROR(stream->BlockHostUntilDone()); - TF_RETURN_IF_ERROR(shaped_buffer.buffers().ForEachElementWithStatus( - [&](const xla::ShapeIndex& index, const se::DeviceMemoryBase& buffer) { - const xla::Shape& buffer_shape = - xla::ShapeUtil::GetSubshape(*output_device_shape, index); - if (buffer_shape.IsTuple()) { - return Status::OK(); - } - xla::Shape& host_shape = - *xla::ShapeUtil::GetMutableSubshape(output_host_shape, index); - xla::Shape& device_shape = - *xla::ShapeUtil::GetMutableSubshape(output_device_shape, index); - if (device_shape.is_static()) { - return Status::OK(); - } - TF_ASSIGN_OR_RETURN(auto metadata, - ReadMetadataLiteral(stream, buffer, buffer_shape, - transfer_manager)); - // Update shape size from metadata. - for (int64 i = 0; i < metadata.element_count(); ++i) { - host_shape.mutable_dimensions()[i] = metadata.Get({i}); - device_shape.mutable_dimensions()[i] = metadata.Get({i}); - } - return Status::OK(); - })); - output_host_shape->clear_dynamic_dimensions(); - output_device_shape->clear_dynamic_dimensions(); - return Status::OK(); -} - xla::StatusOr> CreateOutputTuple( se::Stream* stream, xla::ExecutionOutput run_result, xla::Backend* backend, int device_ordinal) { XRTTupleAllocation* output_tuple; - const xla::ScopedShapedBuffer& shaped_buffer = run_result.Result(); - if (shaped_buffer.on_device_shape().is_dynamic()) { + xla::ScopedShapedBuffer* shaped_buffer = run_result.MutableResult(); + if (shaped_buffer->on_device_shape().is_dynamic()) { // Update dynamic shapes from output buffer, and create a XRT tensor with // dimension sizes read from metadata. 
- xla::Shape output_host_shape = shaped_buffer.on_host_shape(); - xla::Shape output_device_shape = shaped_buffer.on_device_shape(); - TF_RETURN_IF_ERROR(UpdateDynamicOutputs( + xla::Shape output_host_shape = shaped_buffer->on_host_shape(); + xla::Shape output_device_shape = shaped_buffer->on_device_shape(); + TF_ASSIGN_OR_RETURN( + auto transfer_manager, + xla::TransferManager::GetForPlatform(stream->parent()->platform())); + TF_RETURN_IF_ERROR(transfer_manager->ReadDynamicShapes( stream, shaped_buffer, &output_host_shape, &output_device_shape)); TF_RETURN_IF_ERROR(XRTTupleAllocation::CreateFromBuffer( - shaped_buffer, output_host_shape, output_device_shape, backend, + *shaped_buffer, output_host_shape, output_device_shape, backend, device_ordinal, &output_tuple)); } else { // Fast-path: Don't copy shapes of output buffer. TF_RETURN_IF_ERROR(XRTTupleAllocation::CreateFromBuffer( - shaped_buffer, backend, device_ordinal, &output_tuple)); + *shaped_buffer, backend, device_ordinal, &output_tuple)); } // After the output tuple is created, we can release the output result // buffers, to make sure they won't be cleared by its destructor. diff --git a/tensorflow/core/common_runtime/eager/core.cc b/tensorflow/core/common_runtime/eager/core.cc index 77d2b665f5e..0191527748b 100644 --- a/tensorflow/core/common_runtime/eager/core.cc +++ b/tensorflow/core/common_runtime/eager/core.cc @@ -197,7 +197,7 @@ Status EagerOperation::Execute(absl::Span retvals, if (device == kVariantDeviceNull) { bool pin_to_cpu; TF_RETURN_IF_ERROR(eager::MaybePinSmallOpsToCpu( - &pin_to_cpu, op_name(), + &pin_to_cpu, Name(), absl::MakeSpan( reinterpret_cast(inputs_.data()), inputs_.size()), diff --git a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc index 6e17cdf4316..9d662956504 100644 --- a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc +++ b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc @@ -764,8 +764,8 @@ Tensor GetResourceHandle(const string& var_name, const string& container, handle.set_device(device_name); handle.set_container(container); handle.set_name(var_name); - handle.set_hash_code(MakeTypeIndex().hash_code()); - handle.set_maybe_type_name(MakeTypeIndex().name()); + handle.set_hash_code(TypeIndex::Make().hash_code()); + handle.set_maybe_type_name(TypeIndex::Make().name()); Tensor tensor(DT_RESOURCE, TensorShape({})); tensor.scalar()() = handle; return tensor; diff --git a/tensorflow/core/framework/resource_mgr.h b/tensorflow/core/framework/resource_mgr.h index b0e4eace16e..3af8d81b0dc 100644 --- a/tensorflow/core/framework/resource_mgr.h +++ b/tensorflow/core/framework/resource_mgr.h @@ -301,7 +301,7 @@ ResourceHandle MakeResourceHandle( return MakeResourceHandle( container.empty() ? ctx->resource_manager()->default_container() : container, - name, *ctx->device(), MakeTypeIndex(), dtypes_and_shapes); + name, *ctx->device(), TypeIndex::Make(), dtypes_and_shapes); } template @@ -311,7 +311,7 @@ ResourceHandle MakeResourceHandle( return MakeResourceHandle( container.empty() ? 
ctx->resource_manager()->default_container() : container, - name, *ctx->device(), MakeTypeIndex(), dtypes_and_shapes); + name, *ctx->device(), TypeIndex::Make(), dtypes_and_shapes); } Status MakeResourceHandleToOutput(OpKernelContext* context, int output_index, @@ -589,7 +589,7 @@ Status ResourceMgr::Create(const string& container, const string& name, CheckDeriveFromResourceBase(); CHECK(resource != nullptr); mutex_lock l(mu_); - return DoCreate(container, MakeTypeIndex(), name, resource); + return DoCreate(container, TypeIndex::Make(), name, resource); } template @@ -635,7 +635,7 @@ template Status ResourceMgr::LookupInternal(const string& container, const string& name, T** resource) const { ResourceBase* found = nullptr; - Status s = DoLookup(container, MakeTypeIndex(), name, &found); + Status s = DoLookup(container, TypeIndex::Make(), name, &found); if (s.ok()) { // It's safe to down cast 'found' to T* since // typeid(T).hash_code() is part of the map key. @@ -660,7 +660,7 @@ Status ResourceMgr::LookupOrCreate(const string& container, const string& name, s = LookupInternal(container, name, resource); if (s.ok()) return s; TF_RETURN_IF_ERROR(creator(resource)); - s = DoCreate(container, MakeTypeIndex(), name, *resource); + s = DoCreate(container, TypeIndex::Make(), name, *resource); if (!s.ok()) { return errors::Internal("LookupOrCreate failed unexpectedly"); } @@ -671,7 +671,7 @@ Status ResourceMgr::LookupOrCreate(const string& container, const string& name, template Status ResourceMgr::Delete(const string& container, const string& name) { CheckDeriveFromResourceBase(); - return DoDelete(container, MakeTypeIndex(), name); + return DoDelete(container, TypeIndex::Make(), name); } template @@ -710,7 +710,7 @@ Status ValidateDevice(OpKernelContext* ctx, const ResourceHandle& p); template Status ValidateDeviceAndType(OpKernelContext* ctx, const ResourceHandle& p) { TF_RETURN_IF_ERROR(internal::ValidateDevice(ctx, p)); - auto type_index = MakeTypeIndex(); + auto type_index = TypeIndex::Make(); if (type_index.hash_code() != p.hash_code()) { return errors::InvalidArgument( "Trying to access resource using the wrong type. 
Expected ", @@ -883,7 +883,7 @@ ResourceHandle ScopedStepContainer::MakeResourceHandle( mutex_lock ml(mu_); dirty_ = true; return tensorflow::MakeResourceHandle(container_, name, device, - MakeTypeIndex(), {}); + TypeIndex::Make(), {}); } template diff --git a/tensorflow/core/framework/resource_op_kernel.h b/tensorflow/core/framework/resource_op_kernel.h index d8ee52a0e5d..4cb732ae973 100644 --- a/tensorflow/core/framework/resource_op_kernel.h +++ b/tensorflow/core/framework/resource_op_kernel.h @@ -105,7 +105,7 @@ class ResourceOpKernel : public OpKernel { if (has_resource_type_) { OP_REQUIRES_OK(context, MakeResourceHandleToOutput( context, 0, cinfo_.container(), cinfo_.name(), - MakeTypeIndex())); + TypeIndex::Make())); } else { context->set_output_ref(0, &mu_, handle_.AccessTensor(context)); } diff --git a/tensorflow/core/framework/tensor_testutil.cc b/tensorflow/core/framework/tensor_testutil.cc index 1a7812ce4eb..313451d6b83 100644 --- a/tensorflow/core/framework/tensor_testutil.cc +++ b/tensorflow/core/framework/tensor_testutil.cc @@ -42,11 +42,15 @@ void ExpectClose(const Tensor& x, const Tensor& y, double atol, double rtol) { << "typed_atol is negative: " << typed_atol; ASSERT_GE(typed_rtol, static_cast(0.0)) << "typed_rtol is negative: " << typed_rtol; + const int max_failures = 10; + int num_failures = 0; for (int i = 0; i < size; ++i) { EXPECT_TRUE( internal::Helper::IsClose(Tx[i], Ty[i], typed_atol, typed_rtol)) - << "index = " << i << " x = " << Tx[i] << " y = " << Ty[i] - << " typed_atol = " << typed_atol << " typed_rtol = " << typed_rtol; + << "index = " << (++num_failures, i) << " x = " << Tx[i] + << " y = " << Ty[i] << " typed_atol = " << typed_atol + << " typed_rtol = " << typed_rtol; + ASSERT_LT(num_failures, max_failures) << "Too many mismatches, giving up."; } } diff --git a/tensorflow/core/framework/type_index.h b/tensorflow/core/framework/type_index.h index fcf68677a12..e8f715bebda 100644 --- a/tensorflow/core/framework/type_index.h +++ b/tensorflow/core/framework/type_index.h @@ -95,11 +95,6 @@ class TypeIndex { const char* name_; }; -template -inline TypeIndex MakeTypeIndex() { - return TypeIndex::Make(); -} - } // namespace tensorflow #endif // TENSORFLOW_CORE_FRAMEWORK_TYPE_INDEX_H_ diff --git a/tensorflow/core/framework/variant.h b/tensorflow/core/framework/variant.h index 3200d7c81fa..e8a0c332968 100644 --- a/tensorflow/core/framework/variant.h +++ b/tensorflow/core/framework/variant.h @@ -144,7 +144,7 @@ void EncodeVariant(const T& value, string* buf); // Variant y_type_unknown = serialized_proto_f; // Store serialized Variant. // // EXPECT_EQ(x.TypeName(), y_type_unknown.TypeName()); // Looks like Foo. -// EXPECT_EQ(MakeTypeIndex(), +// EXPECT_EQ(TypeIndex::Make(), // y_type_unknown.TypeId()); // class Variant { @@ -227,7 +227,7 @@ class Variant { // of the original type when a TensorValueDataProto is stored as the // value. In this case, it returns the TypeIndex of TensorValueDataProto. TypeIndex TypeId() const { - const TypeIndex VoidTypeIndex = MakeTypeIndex(); + const TypeIndex VoidTypeIndex = TypeIndex::Make(); if (is_empty()) { return VoidTypeIndex; } @@ -244,7 +244,7 @@ class Variant { // otherwise. template T* get() { - const TypeIndex TTypeIndex = MakeTypeIndex(); + const TypeIndex TTypeIndex = TypeIndex::Make(); if (is_empty() || (TTypeIndex != TypeId())) return nullptr; return std::addressof(static_cast*>(GetValue())->value); } @@ -253,7 +253,7 @@ class Variant { // otherwise. 
template const T* get() const { - const TypeIndex TTypeIndex = MakeTypeIndex(); + const TypeIndex TTypeIndex = TypeIndex::Make(); if (is_empty() || (TTypeIndex != TypeId())) return nullptr; return std::addressof( static_cast*>(GetValue())->value); @@ -333,7 +333,7 @@ class Variant { TypeIndex TypeId() const final { const TypeIndex value_type_index = - MakeTypeIndex::type>(); + TypeIndex::Make::type>(); return value_type_index; } diff --git a/tensorflow/core/framework/variant_encode_decode.h b/tensorflow/core/framework/variant_encode_decode.h index 5e08e5a7a60..502bbd57422 100644 --- a/tensorflow/core/framework/variant_encode_decode.h +++ b/tensorflow/core/framework/variant_encode_decode.h @@ -160,7 +160,7 @@ string TypeNameVariantImpl( const T& value, TypeNameResolver) { - return port::MaybeAbiDemangle(MakeTypeIndex().name()); + return port::MaybeAbiDemangle(TypeIndex::Make().name()); } template diff --git a/tensorflow/core/framework/variant_op_registry.h b/tensorflow/core/framework/variant_op_registry.h index 4d94dcd35dd..5879597e5eb 100644 --- a/tensorflow/core/framework/variant_op_registry.h +++ b/tensorflow/core/framework/variant_op_registry.h @@ -521,7 +521,7 @@ class UnaryVariantBinaryOpRegistration { #define INTERNAL_REGISTER_UNARY_VARIANT_DEVICE_COPY_FUNCTION(T, direction, \ device_copy_fn) \ INTERNAL_REGISTER_UNARY_VARIANT_DEVICE_COPY_FUNCTION_UNIQ_HELPER( \ - __COUNTER__, T, direction, MakeTypeIndex(), device_copy_fn) + __COUNTER__, T, direction, TypeIndex::Make(), device_copy_fn) #define INTERNAL_REGISTER_UNARY_VARIANT_DEVICE_COPY_FUNCTION_UNIQ_HELPER( \ ctr, T, direction, type_index, device_copy_fn) \ @@ -542,7 +542,7 @@ class UnaryVariantBinaryOpRegistration { #define REGISTER_UNARY_VARIANT_UNARY_OP_FUNCTION(op, device, T, \ unary_op_function) \ REGISTER_UNARY_VARIANT_UNARY_OP_FUNCTION_UNIQ_HELPER( \ - __COUNTER__, op, device, T, MakeTypeIndex(), unary_op_function) + __COUNTER__, op, device, T, TypeIndex::Make(), unary_op_function) #define REGISTER_UNARY_VARIANT_UNARY_OP_FUNCTION_UNIQ_HELPER( \ ctr, op, device, T, type_index, unary_op_function) \ @@ -563,7 +563,7 @@ class UnaryVariantBinaryOpRegistration { #define REGISTER_UNARY_VARIANT_BINARY_OP_FUNCTION(op, device, T, \ binary_op_function) \ REGISTER_UNARY_VARIANT_BINARY_OP_FUNCTION_UNIQ_HELPER( \ - __COUNTER__, op, device, T, MakeTypeIndex(), binary_op_function) + __COUNTER__, op, device, T, TypeIndex::Make(), binary_op_function) #define REGISTER_UNARY_VARIANT_BINARY_OP_FUNCTION_UNIQ_HELPER( \ ctr, op, device, T, type_index, binary_op_function) \ diff --git a/tensorflow/core/framework/variant_op_registry_test.cc b/tensorflow/core/framework/variant_op_registry_test.cc index 0aef6154a1f..1c45a39770c 100644 --- a/tensorflow/core/framework/variant_op_registry_test.cc +++ b/tensorflow/core/framework/variant_op_registry_test.cc @@ -155,12 +155,12 @@ TEST(VariantOpCopyToGPURegistryTest, TestBasic) { // No registered copy fn for GPU<->GPU. 
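As an illustration (not part of the patch): the MakeTypeIndex&lt;T&gt;() -> TypeIndex::Make&lt;T&gt;() change that recurs throughout this diff is a mechanical rename; registry lookups such as the one below simply key on the resulting TypeIndex. A minimal standalone sketch, with MyVariantValue as a made-up type:

```cpp
#include <iostream>

#include "tensorflow/core/framework/type_index.h"

struct MyVariantValue {};

int main() {
  const tensorflow::TypeIndex idx =
      tensorflow::TypeIndex::Make<MyVariantValue>();
  // A registry keyed on TypeIndex uses these two accessors, exactly as the
  // removed MakeTypeIndex<T>() wrapper did before this rename.
  std::cout << idx.name() << " " << idx.hash_code() << "\n";
}
```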
EXPECT_EQ(UnaryVariantOpRegistry::Global()->GetDeviceCopyFn( VariantDeviceCopyDirection::DEVICE_TO_DEVICE, - MakeTypeIndex()), + TypeIndex::Make()), nullptr); auto* copy_to_gpu_fn = UnaryVariantOpRegistry::Global()->GetDeviceCopyFn( VariantDeviceCopyDirection::HOST_TO_DEVICE, - MakeTypeIndex()); + TypeIndex::Make()); EXPECT_NE(copy_to_gpu_fn, nullptr); VariantValue vv{true /* early_exit */}; @@ -183,7 +183,7 @@ TEST(VariantOpCopyToGPURegistryTest, TestDuplicate) { UnaryVariantOpRegistry registry; UnaryVariantOpRegistry::AsyncVariantDeviceCopyFn f; class FjFjFj {}; - const auto kTypeIndex = MakeTypeIndex(); + const auto kTypeIndex = TypeIndex::Make(); registry.RegisterDeviceCopyFn(VariantDeviceCopyDirection::HOST_TO_DEVICE, kTypeIndex, f); EXPECT_DEATH(registry.RegisterDeviceCopyFn( @@ -193,9 +193,10 @@ TEST(VariantOpCopyToGPURegistryTest, TestDuplicate) { TEST(VariantOpZerosLikeRegistryTest, TestBasicCPU) { class Blah {}; - EXPECT_EQ(UnaryVariantOpRegistry::Global()->GetUnaryOpFn( - ZEROS_LIKE_VARIANT_UNARY_OP, DEVICE_CPU, MakeTypeIndex()), - nullptr); + EXPECT_EQ( + UnaryVariantOpRegistry::Global()->GetUnaryOpFn( + ZEROS_LIKE_VARIANT_UNARY_OP, DEVICE_CPU, TypeIndex::Make()), + nullptr); VariantValue vv_early_exit{true /* early_exit */, 0 /* value */}; Variant v = vv_early_exit; @@ -218,9 +219,10 @@ TEST(VariantOpZerosLikeRegistryTest, TestBasicCPU) { #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM TEST(VariantOpUnaryOpRegistryTest, TestBasicGPU) { class Blah {}; - EXPECT_EQ(UnaryVariantOpRegistry::Global()->GetUnaryOpFn( - ZEROS_LIKE_VARIANT_UNARY_OP, DEVICE_GPU, MakeTypeIndex()), - nullptr); + EXPECT_EQ( + UnaryVariantOpRegistry::Global()->GetUnaryOpFn( + ZEROS_LIKE_VARIANT_UNARY_OP, DEVICE_GPU, TypeIndex::Make()), + nullptr); VariantValue vv_early_exit{true /* early_exit */, 0 /* value */}; Variant v = vv_early_exit; @@ -245,7 +247,7 @@ TEST(VariantOpUnaryOpRegistryTest, TestDuplicate) { UnaryVariantOpRegistry registry; UnaryVariantOpRegistry::VariantUnaryOpFn f; class FjFjFj {}; - const auto kTypeIndex = MakeTypeIndex(); + const auto kTypeIndex = TypeIndex::Make(); registry.RegisterUnaryOpFn(ZEROS_LIKE_VARIANT_UNARY_OP, DEVICE_CPU, kTypeIndex, f); @@ -263,7 +265,7 @@ TEST(VariantOpUnaryOpRegistryTest, TestDuplicate) { TEST(VariantOpAddRegistryTest, TestBasicCPU) { class Blah {}; EXPECT_EQ(UnaryVariantOpRegistry::Global()->GetBinaryOpFn( - ADD_VARIANT_BINARY_OP, DEVICE_CPU, MakeTypeIndex()), + ADD_VARIANT_BINARY_OP, DEVICE_CPU, TypeIndex::Make()), nullptr); VariantValue vv_early_exit{true /* early_exit */, 3 /* value */}; @@ -290,7 +292,7 @@ TEST(VariantOpAddRegistryTest, TestBasicCPU) { TEST(VariantOpAddRegistryTest, TestBasicGPU) { class Blah {}; EXPECT_EQ(UnaryVariantOpRegistry::Global()->GetBinaryOpFn( - ADD_VARIANT_BINARY_OP, DEVICE_GPU, MakeTypeIndex()), + ADD_VARIANT_BINARY_OP, DEVICE_GPU, TypeIndex::Make()), nullptr); VariantValue vv_early_exit{true /* early_exit */, 3 /* value */}; @@ -318,7 +320,7 @@ TEST(VariantOpAddRegistryTest, TestDuplicate) { UnaryVariantOpRegistry registry; UnaryVariantOpRegistry::VariantBinaryOpFn f; class FjFjFj {}; - const auto kTypeIndex = MakeTypeIndex(); + const auto kTypeIndex = TypeIndex::Make(); registry.RegisterBinaryOpFn(ADD_VARIANT_BINARY_OP, DEVICE_CPU, kTypeIndex, f); EXPECT_DEATH(registry.RegisterBinaryOpFn(ADD_VARIANT_BINARY_OP, DEVICE_CPU, diff --git a/tensorflow/core/framework/variant_test.cc b/tensorflow/core/framework/variant_test.cc index 3aa9743353e..5edb6efdc5e 100644 --- a/tensorflow/core/framework/variant_test.cc +++ 
b/tensorflow/core/framework/variant_test.cc @@ -589,7 +589,7 @@ TEST(VariantTest, TensorListTest) { serialized.ToProto(&data); const Variant y_unknown = data; EXPECT_EQ(y_unknown.TypeName(), "TensorList"); - EXPECT_EQ(y_unknown.TypeId(), MakeTypeIndex()); + EXPECT_EQ(y_unknown.TypeId(), TypeIndex::Make()); EXPECT_EQ(y_unknown.DebugString(), strings::StrCat( "Variant")); diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD index 02a26cdd390..257d77541e0 100644 --- a/tensorflow/core/grappler/costs/BUILD +++ b/tensorflow/core/grappler/costs/BUILD @@ -323,6 +323,7 @@ cc_library( ":cost_estimator", ":op_context", ":utils", + "@com_google_absl//absl/strings", "//third_party/eigen3", "//tensorflow/core:framework", "//tensorflow/core:protos_all_cc", diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc index 6f57708a780..fb0d6ecf1d0 100644 --- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc +++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/core/grappler/costs/op_level_cost_estimator.h" +#include "absl/strings/match.h" #include "third_party/eigen3/Eigen/Core" #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/attr_value_util.h" @@ -23,6 +24,7 @@ limitations under the License. #include "tensorflow/core/framework/tensor_shape.pb.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/grappler/clusters/utils.h" +#include "tensorflow/core/grappler/costs/op_context.h" #include "tensorflow/core/grappler/costs/utils.h" namespace tensorflow { @@ -101,16 +103,16 @@ static const Costs::Duration kMinComputeTime(1); namespace { -string GetDataFormat(const OpInfo& op_info) { - string data_format = "NHWC"; // Default format. +std::string GetDataFormat(const OpInfo& op_info) { + std::string data_format = "NHWC"; // Default format. if (op_info.attr().find("data_format") != op_info.attr().end()) { data_format = op_info.attr().at("data_format").s(); } return data_format; } -string GetFilterFormat(const OpInfo& op_info) { - string filter_format = "HWIO"; // Default format. +std::string GetFilterFormat(const OpInfo& op_info) { + std::string filter_format = "HWIO"; // Default format. if (op_info.attr().find("filter_format") != op_info.attr().end()) { filter_format = op_info.attr().at("filter_format").s(); } @@ -202,7 +204,7 @@ int64 CwiseOutputElementCount(const TensorShapeProto& input_shape_1, // Helper function for determining whether there are repeated indices in the // input Einsum equation. -bool CheckRepeatedDimensions(const string& dim_str) { +bool CheckRepeatedDimensions(const absl::string_view dim_str) { int str_size = dim_str.size(); for (int idx = 0; idx < str_size - 1; idx++) { if (dim_str.find(dim_str[idx], idx + 1) != std::string::npos) { @@ -212,6 +214,75 @@ bool CheckRepeatedDimensions(const string& dim_str) { return false; } +// Auxiliary function for determining whether OpLevelCostEstimator is compatible +// with a given Einsum. 
+bool IsEinsumCorrectlyFormed(const OpContext& einsum_context) { + const auto& op_info = einsum_context.op_info; + + auto it = op_info.attr().find("equation"); + if (it == op_info.attr().end()) return false; + const absl::string_view equation = it->second.s(); + std::vector equation_split = absl::StrSplit(equation, "->"); + + if (equation_split.empty()) { + LOG(WARNING) << "Einsum with malformed equation"; + return false; + } + std::vector input_split = + absl::StrSplit(equation_split[0], ','); + + // The current model covers Einsum operations with two operands and a RHS + if (op_info.inputs_size() != 2 || equation_split.size() != 2) { + VLOG(1) << "Missing accurate estimator for op: " << op_info.op(); + return false; + } + const auto& a_input = op_info.inputs(0); + const auto& b_input = op_info.inputs(1); + absl::string_view rhs_str = equation_split[1]; + absl::string_view a_input_str = input_split[0]; + absl::string_view b_input_str = input_split[1]; + + // Ellipsis are not currently supported + if (absl::StrContains(a_input_str, "...") || + absl::StrContains(b_input_str, "...")) { + VLOG(1) << "Missing accurate estimator for op: " << op_info.op() + << ", ellipsis not supported"; + return false; + } + + constexpr int kMatrixRank = 2; + + bool a_input_shape_unknown = false; + bool b_input_shape_unknown = false; + + TensorShapeProto a_input_shape = MaybeGetMinimumShape( + a_input.shape(), std::max(kMatrixRank, a_input.shape().dim_size()), + &a_input_shape_unknown); + TensorShapeProto b_input_shape = MaybeGetMinimumShape( + b_input.shape(), std::max(kMatrixRank, b_input.shape().dim_size()), + &b_input_shape_unknown); + + if (a_input_str.size() != static_cast(a_input_shape.dim_size()) || + b_input_str.size() != static_cast(b_input_shape.dim_size())) { + VLOG(1) << "Missing accurate estimator for op: " << op_info.op() + << ", equation subscripts don't match tensor rank."; + return false; + } + + // Subscripts where axis appears more than once for a single input are not yet + // supported + if (CheckRepeatedDimensions(a_input_str) || + CheckRepeatedDimensions(b_input_str) || + CheckRepeatedDimensions(rhs_str)) { + VLOG(1) << "Missing accurate estimator for op: " << op_info.op() + << ", Subscripts where axis appears more than once for a single " + "input are not yet supported"; + return false; + } + + return true; +} + } // namespace // Return a minimum shape if the shape is unknown. 
If known, return the original @@ -528,7 +599,7 @@ DeviceInfo OpLevelCostEstimator::GetDeviceInfo( } } } else if (device.type() == "GPU") { - const string architecture = device.environment().at("architecture"); + const std::string architecture = device.environment().at("architecture"); int cores_per_multiprocessor; if (architecture < "3") { // Fermi @@ -695,7 +766,7 @@ OpLevelCostEstimator::ConvolutionDimensionsFromInputs( VLOG(2) << "Original filter shape: " << original_filter_shape.DebugString(); int x_index, y_index, major_channel_index, minor_channel_index = -1; - const string& data_format = GetDataFormat(op_info); + const std::string& data_format = GetDataFormat(op_info); if (data_format == "NCHW") { major_channel_index = 1; y_index = 2; @@ -712,7 +783,7 @@ OpLevelCostEstimator::ConvolutionDimensionsFromInputs( x_index = 2; major_channel_index = 3; } - const string& filter_format = GetFilterFormat(op_info); + const std::string& filter_format = GetFilterFormat(op_info); int filter_x_index, filter_y_index, in_major_channel_index, out_channel_index, in_minor_channel_index = -1; if (filter_format == "HWIO") { @@ -906,6 +977,130 @@ int64 OpLevelCostEstimator::CountMatMulOperations(const OpInfo& op_info, return ops; } +bool OpLevelCostEstimator::GenerateBatchMatmulContextFromEinsum( + const OpContext& einsum_context, OpContext* batch_matmul_context, + bool* found_unknown_shapes) const { + // This auxiliary function transforms an einsum OpContext into its equivalent + // Batch Matmul OpContext. The function returns a boolean, which determines + // whether it was successful in generating the output OpContext or not. + + // Einsum computes a generalized contraction between tensors of arbitrary + // dimension as defined by the equation written in the Einstein summation + // convention. The number of tensors in the computation and the number of + // contractions can be arbitrarily long. The current model only contemplates + // Einsum equations, which can be translated into a single BatchMatMul + // operation. Einsum operations with more than two operands are not currently + // supported. Subscripts where an axis appears more than once for a single + // input and ellipsis are currently also excluded. See: + // https://www.tensorflow.org/api_docs/python/tf/einsum + // We distinguish four kinds of dimensions, depending on their placement in + // the equation: + // + B: Batch dimensions: Dimensions which appear in both operands and RHS. + // + K: Contracting dimensions: These appear in both inputs but not RHS. + // + M: Operand A dimensions: These appear in the first operand and the RHS. + // + N: Operand B dimensions: These appear in the second operand and the RHS. 
+ // Then, the operation to estimate is BatchMatMul([B,M,K],[B,K,N]) + + if (batch_matmul_context == nullptr) { + VLOG(1) << "Output context should not be a nullptr."; + return false; + } + if (!IsEinsumCorrectlyFormed(einsum_context)) return false; + const auto& op_info = einsum_context.op_info; + std::vector equation_split = + absl::StrSplit(op_info.attr().find("equation")->second.s(), "->"); + std::vector input_split = + absl::StrSplit(equation_split[0], ','); + const auto& a_input = op_info.inputs(0); + const auto& b_input = op_info.inputs(1); + absl::string_view rhs_str = equation_split[1]; + absl::string_view a_input_str = input_split[0]; + absl::string_view b_input_str = input_split[1]; + + constexpr int kMatrixRank = 2; + + bool a_input_shape_unknown = false; + bool b_input_shape_unknown = false; + + TensorShapeProto a_input_shape = MaybeGetMinimumShape( + a_input.shape(), std::max(kMatrixRank, a_input.shape().dim_size()), + &a_input_shape_unknown); + TensorShapeProto b_input_shape = MaybeGetMinimumShape( + b_input.shape(), std::max(kMatrixRank, b_input.shape().dim_size()), + &b_input_shape_unknown); + + *found_unknown_shapes = a_input_shape_unknown || b_input_shape_unknown || + (a_input.shape().dim_size() < kMatrixRank) || + (b_input.shape().dim_size() < kMatrixRank); + + OpInfo batch_matmul_op_info = op_info; + batch_matmul_op_info.mutable_inputs()->Clear(); + batch_matmul_op_info.set_op("BatchMatMul"); + + AttrValue transpose_attribute; + transpose_attribute.set_b(false); + (*batch_matmul_op_info.mutable_attr())["transpose_a"] = transpose_attribute; + (*batch_matmul_op_info.mutable_attr())["transpose_b"] = transpose_attribute; + + OpInfo::TensorProperties* a_matrix = batch_matmul_op_info.add_inputs(); + TensorShapeProto* a_matrix_shape = a_matrix->mutable_shape(); + a_matrix->set_dtype(a_input.dtype()); + + OpInfo::TensorProperties* b_matrix = batch_matmul_op_info.add_inputs(); + b_matrix->set_dtype(b_input.dtype()); + TensorShapeProto* b_matrix_shape = b_matrix->mutable_shape(); + + TensorShapeProto_Dim m_dim; + TensorShapeProto_Dim n_dim; + TensorShapeProto_Dim k_dim; + + m_dim.set_size(1); + n_dim.set_size(1); + k_dim.set_size(1); + + for (int i_idx = 0, a_input_str_size = a_input_str.size(); + i_idx < a_input_str_size; ++i_idx) { + if (b_input_str.find(a_input_str[i_idx]) == std::string::npos) { + if (rhs_str.find(a_input_str[i_idx]) == std::string::npos) { + VLOG(1) << "Missing accurate estimator for op: " << op_info.op(); + return false; + } + + m_dim.set_size(m_dim.size() * a_input_shape.dim(i_idx).size()); + continue; + } else if (rhs_str.find(a_input_str[i_idx]) == std::string::npos) { + // The dimension does not appear in the RHS, therefore it is a contracting + // dimension. + k_dim.set_size(k_dim.size() * a_input_shape.dim(i_idx).size()); + continue; + } + // It appears in both input operands, therefore we place it as an outer + // dimension for the Batch Matmul. + *(a_matrix_shape->add_dim()) = a_input_shape.dim(i_idx); + *(b_matrix_shape->add_dim()) = a_input_shape.dim(i_idx); + } + for (int i_idx = 0, b_input_str_size = b_input_str.size(); + i_idx < b_input_str_size; ++i_idx) { + if (a_input_str.find(b_input_str[i_idx]) == std::string::npos) { + if (rhs_str.find(b_input_str[i_idx]) == std::string::npos) { + VLOG(1) << "Missing accurate estimator for op: " << op_info.op(); + return false; + } + n_dim.set_size(n_dim.size() * b_input_shape.dim(i_idx).size()); + } + } + + // The two inner-most dimensions of the Batch Matmul are added. 
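As an illustration (not part of the patch): the B/K/M/N classification described in the comment above can be made concrete with a small standalone sketch that buckets the subscripts of the illustrative equation "bij,bjk->bik" the same way the loops in this function do:

```cpp
#include <iostream>
#include <string>

int main() {
  const std::string a = "bij", b = "bjk", rhs = "bik";
  std::string batch_dims, m_dims, n_dims, k_dims;
  for (char c : a) {
    const bool in_b = b.find(c) != std::string::npos;
    const bool in_rhs = rhs.find(c) != std::string::npos;
    if (in_b && in_rhs) {
      batch_dims += c;  // B: appears in both operands and the RHS.
    } else if (in_b) {
      k_dims += c;      // K: appears in both operands but not the RHS.
    } else {
      m_dims += c;      // M: appears in the first operand and the RHS.
    }
  }
  for (char c : b) {
    if (a.find(c) == std::string::npos) {
      n_dims += c;      // N: appears in the second operand and the RHS.
    }
  }
  // Prints "B=b M=i K=j N=k": BatchMatMul([B,M,K], [B,K,N]).
  std::cout << "B=" << batch_dims << " M=" << m_dims << " K=" << k_dims
            << " N=" << n_dims << "\n";
}
```

GenerateBatchMatmulContextFromEinsum then multiplies the extents within each bucket to obtain the M, K and N sizes of the synthetic BatchMatMul.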
+ *(a_matrix_shape->add_dim()) = m_dim; + *(a_matrix_shape->add_dim()) = k_dim; + *(b_matrix_shape->add_dim()) = k_dim; + *(b_matrix_shape->add_dim()) = n_dim; + + *batch_matmul_context = einsum_context; + batch_matmul_context->op_info = batch_matmul_op_info; + return true; +} + int64 OpLevelCostEstimator::CountBatchMatMulOperations( const OpInfo& op_info, bool* found_unknown_shapes) { return CountBatchMatMulOperations(op_info, nullptr, found_unknown_shapes); @@ -1327,7 +1522,7 @@ Costs OpLevelCostEstimator::PredictFusedConv2DBiasActivation( // contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc // TODO(yaozhang): Support NHWC_VECT_W. - string data_format = GetDataFormat(op_context.op_info); + std::string data_format = GetDataFormat(op_context.op_info); if (data_format != "NCHW" && data_format != "NHWC" && data_format != "NCHW_VECT_C") { LOG(WARNING) << "unsupported data format: " << data_format; @@ -1335,7 +1530,7 @@ Costs OpLevelCostEstimator::PredictFusedConv2DBiasActivation( cost.inaccurate = true; return cost; } - string filter_format = GetFilterFormat(op_context.op_info); + std::string filter_format = GetFilterFormat(op_context.op_info); if (filter_format != "HWIO" && filter_format != "OIHW" && filter_format != "OIHW_VECT_I") { LOG(WARNING) << "unsupported filter format: " << filter_format; @@ -1405,154 +1600,17 @@ Costs OpLevelCostEstimator::PredictMatMul(const OpContext& op_context) const { } Costs OpLevelCostEstimator::PredictEinsum(const OpContext& op_context) const { - // Einsum computes a generalized contraction between tensors of arbitrary - // dimension as defined by the equation written in the Einstein summation - // convention. The number of tensors in the computation and the number of - // contractions can be arbitrarily long. The current model only contemplates - // Einsum equations, which can be translated into a single BatchMatMul - // operation. Einsum operations with more than two operands are not currently - // supported. Subscripts where an axis appears more than once for a single - // input and ellipsis are currently also excluded. See: - // https://www.tensorflow.org/api_docs/python/tf/einsum - // We distinguish four kinds of dimensions, depending on their placement in - // the equation: - // + B: Batch dimensions: Dimensions which appear in both operands and RHS. - // + K: Contracting dimensions: These appear in both inputs but not RHS. - // + M: Operand A dimensions: These appear in the first operand and the RHS. - // + N: Operand B dimensions: These appear in the second operand and the RHS. 
- // Then, the operation to estimate is BatchMatMul([B,M,K],[B,K,N]) const auto& op_info = op_context.op_info; auto it = op_info.attr().find("equation"); if (it == op_info.attr().end()) return Costs::ZeroCosts(/*inaccurate=*/true); - const string& equation = it->second.s(); - std::vector equation_split = absl::StrSplit(equation, "->"); - - if (equation_split.empty()) { - LOG(WARNING) << "Einsum with malformed equation"; - return PredictCostOfAnUnknownOp(op_context); - } - std::vector input_split = absl::StrSplit(equation_split[0], ','); - - // The current model covers Einsum operations with two operands and a RHS - if (op_info.inputs_size() != 2 || equation_split.size() != 2) { - VLOG(1) << "Missing accurate estimator for op: " << op_info.op(); - return PredictCostOfAnUnknownOp(op_context); - } - string rhs_str = equation_split[1]; - string a_input_str = input_split[0]; - string b_input_str = input_split[1]; - - // Ellipsis are not currently supported - if (a_input_str.find("...") != std::string::npos || - b_input_str.find("...") != std::string::npos) { - VLOG(1) << "Missing accurate estimator for op: " << op_info.op() - << ", ellipsis not supported"; - return PredictCostOfAnUnknownOp(op_context); - } - - const auto& a_input = op_info.inputs(0); - const auto& b_input = op_info.inputs(1); - const int matrix_rank = 2; - + OpContext batch_matmul_op_context; bool found_unknown_shapes = false; - bool a_input_shape_unknown = false; - bool b_input_shape_unknown = false; - - TensorShapeProto a_input_shape = MaybeGetMinimumShape( - a_input.shape(), std::max(matrix_rank, a_input.shape().dim_size()), - &a_input_shape_unknown); - TensorShapeProto b_input_shape = MaybeGetMinimumShape( - b_input.shape(), std::max(matrix_rank, b_input.shape().dim_size()), - &b_input_shape_unknown); - - found_unknown_shapes = a_input_shape_unknown || b_input_shape_unknown || - (a_input.shape().dim_size() < matrix_rank) || - (b_input.shape().dim_size() < matrix_rank); - - if (a_input_str.size() != static_cast(a_input_shape.dim_size()) || - b_input_str.size() != static_cast(b_input_shape.dim_size())) { - VLOG(1) << "Missing accurate estimator for op: " << op_info.op() - << ", equation subscripts don't match tensor rank."; + bool success = GenerateBatchMatmulContextFromEinsum( + op_context, &batch_matmul_op_context, &found_unknown_shapes); + if (!success) { return PredictCostOfAnUnknownOp(op_context); } - - // Subscripts where axis appears more than once for a single input are not yet - // supported - if (CheckRepeatedDimensions(a_input_str) || - CheckRepeatedDimensions(b_input_str) || - CheckRepeatedDimensions(rhs_str)) { - VLOG(1) << "Missing accurate estimator for op: " << op_info.op() - << ", Subscripts where axis appears more than once for a single " - "input are not yet supported"; - return PredictCostOfAnUnknownOp(op_context); - } - - OpInfo batch_matmul_op_info = op_info; - batch_matmul_op_info.mutable_inputs()->Clear(); - batch_matmul_op_info.set_op("BatchMatMul"); - - AttrValue transpose_attribute; - transpose_attribute.set_b(false); - (*batch_matmul_op_info.mutable_attr())["transpose_a"] = transpose_attribute; - (*batch_matmul_op_info.mutable_attr())["transpose_b"] = transpose_attribute; - - OpInfo::TensorProperties* a_matrix = batch_matmul_op_info.add_inputs(); - TensorShapeProto* a_matrix_shape = a_matrix->mutable_shape(); - a_matrix->set_dtype(a_input.dtype()); - - OpInfo::TensorProperties* b_matrix = batch_matmul_op_info.add_inputs(); - b_matrix->set_dtype(b_input.dtype()); - TensorShapeProto* b_matrix_shape = 
b_matrix->mutable_shape(); - - TensorShapeProto_Dim m_dim; - TensorShapeProto_Dim n_dim; - TensorShapeProto_Dim k_dim; - - m_dim.set_size(1); - n_dim.set_size(1); - k_dim.set_size(1); - - for (int i_idx = 0, a_input_str_size = a_input_str.size(); - i_idx < a_input_str_size; ++i_idx) { - if (b_input_str.find(a_input_str[i_idx]) == std::string::npos) { - if (rhs_str.find(a_input_str[i_idx]) == std::string::npos) { - VLOG(1) << "Missing accurate estimator for op: " << op_info.op(); - return PredictCostOfAnUnknownOp(op_context); - } - - m_dim.set_size(m_dim.size() * a_input_shape.dim(i_idx).size()); - continue; - } else if (rhs_str.find(a_input_str[i_idx]) == std::string::npos) { - // The dimension does not appear in the RHS, therefore it is a contracting - // dimension. - k_dim.set_size(k_dim.size() * a_input_shape.dim(i_idx).size()); - continue; - } - // It appears in both input operands, therefore we place it as an outer - // dimension for the Batch Matmul. - *(a_matrix_shape->add_dim()) = a_input_shape.dim(i_idx); - *(b_matrix_shape->add_dim()) = a_input_shape.dim(i_idx); - } - for (int i_idx = 0, b_input_str_size = b_input_str.size(); - i_idx < b_input_str_size; ++i_idx) { - if (a_input_str.find(b_input_str[i_idx]) == std::string::npos) { - if (rhs_str.find(b_input_str[i_idx]) == std::string::npos) { - VLOG(1) << "Missing accurate estimator for op: " << op_info.op(); - return PredictCostOfAnUnknownOp(op_context); - } - n_dim.set_size(n_dim.size() * b_input_shape.dim(i_idx).size()); - } - } - - // The two inner-most dimensions of the Batch Matmul are added. - *(a_matrix_shape->add_dim()) = m_dim; - *(a_matrix_shape->add_dim()) = k_dim; - *(b_matrix_shape->add_dim()) = k_dim; - *(b_matrix_shape->add_dim()) = n_dim; - - OpContext batch_matmul_op_context = op_context; - batch_matmul_op_context.op_info = batch_matmul_op_info; Costs costs = PredictCosts(batch_matmul_op_context); costs.inaccurate = costs.inaccurate || found_unknown_shapes; costs.num_ops_with_unknown_shapes = found_unknown_shapes; @@ -1772,7 +1830,7 @@ Costs OpLevelCostEstimator::PredictFusedOp( /* static */ OpContext OpLevelCostEstimator::FusedChildContext( - const OpContext& parent, const string& op_name, + const OpContext& parent, const std::string& op_name, const OpInfo::TensorProperties& output, const std::vector& inputs) { // Setup the base parameters of our new context. 
@@ -1821,7 +1879,7 @@ OpLevelCostEstimator::OpDimensionsFromInputs( VLOG(2) << "Image shape: " << image_shape.DebugString(); int x_index, y_index, channel_index; - const string& data_format = GetDataFormat(op_info); + const std::string& data_format = GetDataFormat(op_info); if (data_format == "NCHW") { channel_index = 1; y_index = 2; diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.h b/tensorflow/core/grappler/costs/op_level_cost_estimator.h index ad2df8fcdd5..2bf3c5bb920 100644 --- a/tensorflow/core/grappler/costs/op_level_cost_estimator.h +++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.h @@ -138,6 +138,9 @@ class OpLevelCostEstimator { static int64 CountMatMulOperations(const OpInfo& op_info, MatMulDimensions* mat_mul, bool* found_unknown_shapes); + bool GenerateBatchMatmulContextFromEinsum(const OpContext& einsum_context, + OpContext* batch_matmul_context, + bool* found_unknown_shapes) const; static int64 CountBatchMatMulOperations(const OpInfo& op_info, bool* found_unknown_shapes); static int64 CountBatchMatMulOperations(const OpInfo& op_info, diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index e2ff5aed283..1e05ee90ff8 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -1762,6 +1762,7 @@ tf_cuda_cc_test( name = "conv_ops_test", size = "medium", srcs = ["conv_ops_test.cc"], + tags = ["no_cuda11"], # b/159664089 deps = [ ":conv_ops", ":image", diff --git a/tensorflow/core/kernels/conditional_accumulator_op.cc b/tensorflow/core/kernels/conditional_accumulator_op.cc index 6b6feb81cfa..debe2368d28 100644 --- a/tensorflow/core/kernels/conditional_accumulator_op.cc +++ b/tensorflow/core/kernels/conditional_accumulator_op.cc @@ -90,7 +90,7 @@ class ResourceConditionalAccumulatorOp : public ConditionalAccumulatorBaseOp { h(1) = cinfo_.name(); OP_REQUIRES_OK(ctx, MakeResourceHandleToOutput( ctx, 0, cinfo_.container(), cinfo_.name(), - MakeTypeIndex())); + TypeIndex::Make())); } TF_DISALLOW_COPY_AND_ASSIGN(ResourceConditionalAccumulatorOp); diff --git a/tensorflow/core/kernels/data/dataset_utils.h b/tensorflow/core/kernels/data/dataset_utils.h index ac087360fd0..0127fe68641 100644 --- a/tensorflow/core/kernels/data/dataset_utils.h +++ b/tensorflow/core/kernels/data/dataset_utils.h @@ -35,7 +35,7 @@ Status CreateHandle(OpKernelContext* ctx, T* resource, TF_RETURN_IF_ERROR(mgr->Create(container_name, unique_name, resource)); *handle = MakeResourceHandle(container_name, unique_name, *ctx->device(), - MakeTypeIndex()); + TypeIndex::Make()); return Status::OK(); } diff --git a/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc b/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc index 65252e3dbcf..a9c682a426b 100644 --- a/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc @@ -111,7 +111,7 @@ class ThreadPoolHandleOp : public OpKernel { } OP_REQUIRES_OK(ctx, MakeResourceHandleToOutput( ctx, 0, cinfo_.container(), cinfo_.name(), - MakeTypeIndex())); + TypeIndex::Make())); } private: diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc index 8dd7f4c364b..1996e7f230e 100644 --- a/tensorflow/core/kernels/data/iterator_ops.cc +++ b/tensorflow/core/kernels/data/iterator_ops.cc @@ -443,7 +443,7 @@ void IteratorHandleOp::Compute(OpKernelContext* context) } OP_REQUIRES_OK(context, MakeResourceHandleToOutput( context, 0, cinfo_.container(), 
cinfo_.name(), - MakeTypeIndex())); + TypeIndex::Make())); } Status IteratorHandleOp::VerifyResource(IteratorResource* resource) { diff --git a/tensorflow/core/kernels/data/multi_device_iterator_ops.cc b/tensorflow/core/kernels/data/multi_device_iterator_ops.cc index 7be03632d94..f3f67bcad07 100644 --- a/tensorflow/core/kernels/data/multi_device_iterator_ops.cc +++ b/tensorflow/core/kernels/data/multi_device_iterator_ops.cc @@ -475,7 +475,7 @@ class MultiDeviceIteratorHandleOp : public OpKernel { } OP_REQUIRES_OK(context, MakeResourceHandleToOutput( context, 0, container_name, unique_name, - MakeTypeIndex())); + TypeIndex::Make())); } private: diff --git a/tensorflow/core/kernels/ops_testutil.h b/tensorflow/core/kernels/ops_testutil.h index ea79a4b416b..93eee6ff350 100644 --- a/tensorflow/core/kernels/ops_testutil.h +++ b/tensorflow/core/kernels/ops_testutil.h @@ -126,7 +126,7 @@ class OpsTestBase : public ::testing::Test { std::string container_name = container.empty() ? rm->default_container() : container; EXPECT_TRUE(rm->Create(container_name, name, resource).ok()); - AddResourceInputInternal(container_name, name, MakeTypeIndex()); + AddResourceInputInternal(container_name, name, TypeIndex::Make()); } // Runs an operation producing 'num_outputs' outputs. diff --git a/tensorflow/core/kernels/tile_ops.cc b/tensorflow/core/kernels/tile_ops.cc index e626d430864..f733d9b9aea 100644 --- a/tensorflow/core/kernels/tile_ops.cc +++ b/tensorflow/core/kernels/tile_ops.cc @@ -554,7 +554,7 @@ inline void TileGradientOp::HandleCase( OpKernelContext* context, const std::vector& input_dims, const gtl::ArraySlice& multiples_array, Tensor* result) { LOG(FATAL) << "TileGradientOp: Invalid combination of Device, DT and NDIM: " - << MakeTypeIndex().name() << ", " << DataTypeString(DT) + << TypeIndex::Make().name() << ", " << DataTypeString(DT) << ", " << NDIM; } diff --git a/tensorflow/core/ops/compat/ops_history_v2/DecodeImage.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/DecodeImage.pbtxt new file mode 100644 index 00000000000..066ffd1091d --- /dev/null +++ b/tensorflow/core/ops/compat/ops_history_v2/DecodeImage.pbtxt @@ -0,0 +1,39 @@ +op { + name: "DecodeImage" + input_arg { + name: "contents" + type: DT_STRING + } + output_arg { + name: "image" + type_attr: "dtype" + } + attr { + name: "channels" + type: "int" + default_value { + i: 0 + } + } + attr { + name: "dtype" + type: "type" + default_value { + type: DT_UINT8 + } + allowed_values { + list { + type: DT_UINT8 + type: DT_UINT16 + type: DT_FLOAT + } + } + } + attr { + name: "expand_animations" + type: "bool" + default_value { + b: true + } + } +} diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index dbd91c91b65..dec894cc173 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -11476,6 +11476,45 @@ op { type: DT_UINT8 } } +op { + name: "DecodeImage" + input_arg { + name: "contents" + type: DT_STRING + } + output_arg { + name: "image" + type_attr: "dtype" + } + attr { + name: "channels" + type: "int" + default_value { + i: 0 + } + } + attr { + name: "dtype" + type: "type" + default_value { + type: DT_UINT8 + } + allowed_values { + list { + type: DT_UINT8 + type: DT_UINT16 + type: DT_FLOAT + } + } + } + attr { + name: "expand_animations" + type: "bool" + default_value { + b: true + } + } +} op { name: "DecodeJSONExample" input_arg { diff --git a/tensorflow/core/platform/abi_test.cc b/tensorflow/core/platform/abi_test.cc index 3a01953aec2..b6f8dd5c4ec 100644 --- 
a/tensorflow/core/platform/abi_test.cc +++ b/tensorflow/core/platform/abi_test.cc @@ -23,14 +23,14 @@ namespace tensorflow { struct MyRandomPODType {}; TEST(AbiTest, AbiDemangleTest) { - EXPECT_EQ(port::MaybeAbiDemangle(MakeTypeIndex().name()), "int"); + EXPECT_EQ(port::MaybeAbiDemangle(TypeIndex::Make().name()), "int"); #ifdef PLATFORM_WINDOWS const char pod_type_name[] = "struct tensorflow::MyRandomPODType"; #else const char pod_type_name[] = "tensorflow::MyRandomPODType"; #endif - EXPECT_EQ(port::MaybeAbiDemangle(MakeTypeIndex().name()), + EXPECT_EQ(port::MaybeAbiDemangle(TypeIndex::Make().name()), pod_type_name); EXPECT_EQ( diff --git a/tensorflow/core/platform/ctstring_internal.h b/tensorflow/core/platform/ctstring_internal.h index 69338e6e4b7..f75fd04f955 100644 --- a/tensorflow/core/platform/ctstring_internal.h +++ b/tensorflow/core/platform/ctstring_internal.h @@ -136,7 +136,7 @@ typedef struct TF_TString { // NOLINT // _Static_assert(CHAR_BIT == 8); // _Static_assert(sizeof(TF_TString) == 24); -extern inline TF_TString_Type TF_TString_GetType(const TF_TString *str) { +static inline TF_TString_Type TF_TString_GetType(const TF_TString *str) { return (TF_TString_Type)(str->u.raw.raw[0] & TF_TSTR_TYPE_MASK); // NOLINT } @@ -168,12 +168,12 @@ static inline size_t TF_TString_ToInternalSizeT(size_t size, #endif // TF_TSTRING_LITTLE_ENDIAN } -extern inline void TF_TString_Init(TF_TString *str) { +static inline void TF_TString_Init(TF_TString *str) { str->u.smll.size = 0; str->u.smll.str[0] = '\0'; } -extern inline void TF_TString_Dealloc(TF_TString *str) { +static inline void TF_TString_Dealloc(TF_TString *str) { if (TF_TString_GetType(str) == TF_TSTR_LARGE && str->u.large.ptr != NULL) { // NOLINT free(str->u.large.ptr); @@ -181,7 +181,7 @@ extern inline void TF_TString_Dealloc(TF_TString *str) { } } -extern inline size_t TF_TString_GetSize(const TF_TString *str) { +static inline size_t TF_TString_GetSize(const TF_TString *str) { switch (TF_TString_GetType(str)) { case TF_TSTR_SMALL: return str->u.smll.size >> 2; @@ -196,7 +196,7 @@ extern inline size_t TF_TString_GetSize(const TF_TString *str) { } } -extern inline size_t TF_TString_GetCapacity(const TF_TString *str) { +static inline size_t TF_TString_GetCapacity(const TF_TString *str) { switch (TF_TString_GetType(str)) { case TF_TSTR_SMALL: return TF_TString_SmallCapacity; @@ -209,7 +209,7 @@ extern inline size_t TF_TString_GetCapacity(const TF_TString *str) { } } -extern inline const char *TF_TString_GetDataPointer(const TF_TString *str) { +static inline const char *TF_TString_GetDataPointer(const TF_TString *str) { switch (TF_TString_GetType(str)) { case TF_TSTR_SMALL: return str->u.smll.str; @@ -225,7 +225,7 @@ extern inline const char *TF_TString_GetDataPointer(const TF_TString *str) { } } -extern inline char *TF_TString_ResizeUninitialized(TF_TString *str, +static inline char *TF_TString_ResizeUninitialized(TF_TString *str, size_t new_size) { size_t curr_size = TF_TString_GetSize(str); size_t copy_size = TF_min(new_size, curr_size); @@ -288,7 +288,7 @@ extern inline char *TF_TString_ResizeUninitialized(TF_TString *str, return str->u.large.ptr; } -extern inline char *TF_TString_GetMutableDataPointer(TF_TString *str) { +static inline char *TF_TString_GetMutableDataPointer(TF_TString *str) { switch (TF_TString_GetType(str)) { case TF_TSTR_SMALL: return str->u.smll.str; @@ -306,7 +306,7 @@ extern inline char *TF_TString_GetMutableDataPointer(TF_TString *str) { } } -extern inline void TF_TString_Reserve(TF_TString *str, size_t new_cap) { 
+static inline void TF_TString_Reserve(TF_TString *str, size_t new_cap) { TF_TString_Type curr_type = TF_TString_GetType(str); if (new_cap <= TF_TString_SmallCapacity) { @@ -347,7 +347,7 @@ extern inline void TF_TString_Reserve(TF_TString *str, size_t new_cap) { str->u.large.cap = new_cap; } -extern inline char *TF_TString_Resize(TF_TString *str, size_t new_size, +static inline char *TF_TString_Resize(TF_TString *str, size_t new_size, char c) { size_t curr_size = TF_TString_GetSize(str); char *cstr = TF_TString_ResizeUninitialized(str, new_size); @@ -359,7 +359,7 @@ extern inline char *TF_TString_Resize(TF_TString *str, size_t new_size, return cstr; } -extern inline void TF_TString_AssignView(TF_TString *dst, const char *src, +static inline void TF_TString_AssignView(TF_TString *dst, const char *src, size_t size) { TF_TString_Dealloc(dst); @@ -367,7 +367,7 @@ extern inline void TF_TString_AssignView(TF_TString *dst, const char *src, dst->u.view.ptr = src; } -extern inline void TF_TString_AppendN(TF_TString *dst, const char *src, +static inline void TF_TString_AppendN(TF_TString *dst, const char *src, size_t src_size) { if (!src_size) return; @@ -378,21 +378,21 @@ extern inline void TF_TString_AppendN(TF_TString *dst, const char *src, memcpy(dst_c + dst_size, src, src_size); } -extern inline void TF_TString_Append(TF_TString *dst, const TF_TString *src) { +static inline void TF_TString_Append(TF_TString *dst, const TF_TString *src) { const char *src_c = TF_TString_GetDataPointer(src); size_t size = TF_TString_GetSize(src); TF_TString_AppendN(dst, src_c, size); } -extern inline void TF_TString_Copy(TF_TString *dst, const char *src, +static inline void TF_TString_Copy(TF_TString *dst, const char *src, size_t size) { char *dst_c = TF_TString_ResizeUninitialized(dst, size); if (size) memcpy(dst_c, src, size); } -extern inline void TF_TString_Assign(TF_TString *dst, const TF_TString *src) { +static inline void TF_TString_Assign(TF_TString *dst, const TF_TString *src) { if (dst == src) return; TF_TString_Dealloc(dst); @@ -421,7 +421,7 @@ extern inline void TF_TString_Assign(TF_TString *dst, const TF_TString *src) { } } -extern inline void TF_TString_Move(TF_TString *dst, TF_TString *src) { +static inline void TF_TString_Move(TF_TString *dst, TF_TString *src) { if (dst == src) return; TF_TString_Dealloc(dst); diff --git a/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc b/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc index 931801427e7..b620b51cc99 100644 --- a/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc +++ b/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc @@ -1518,6 +1518,7 @@ Status CuptiTracer::DisableActivityTracing() { Status CuptiTracer::Finalize() { if (option_->cupti_finalize) { + VLOG(1) << "CuptiFinalize"; RETURN_IF_CUPTI_ERROR(cupti_interface_->Finalize()); } return Status::OK(); diff --git a/tensorflow/core/profiler/internal/gpu/device_tracer.cc b/tensorflow/core/profiler/internal/gpu/device_tracer.cc index 3c0ac04caf2..48391324f79 100644 --- a/tensorflow/core/profiler/internal/gpu/device_tracer.cc +++ b/tensorflow/core/profiler/internal/gpu/device_tracer.cc @@ -612,8 +612,11 @@ Status GpuTracer::DoStart() { options_.activities_selected.push_back(CUPTI_ACTIVITY_KIND_MEMCPY2); options_.activities_selected.push_back(CUPTI_ACTIVITY_KIND_OVERHEAD); +// CUDA/CUPTI 10 have issues (leaks and crashes) with CuptiFinalize. 
#if CUDA_VERSION < 10000 - if (!trace_concurrent_kernels) options_.cupti_finalize = true; + if (!options.trace_concurrent_kernels()) options_.cupti_finalize = true; +#elif CUDA_VERSION >= 11000 + options_.cupti_finalize = true; #endif CuptiTracerCollectorOptions collector_options; diff --git a/tensorflow/core/tpu/BUILD b/tensorflow/core/tpu/BUILD index aa811f23672..589af63da52 100644 --- a/tensorflow/core/tpu/BUILD +++ b/tensorflow/core/tpu/BUILD @@ -141,7 +141,6 @@ cc_library( "//tensorflow/core/tpu/kernels:tpu_util_c_api_hdrs", "//tensorflow/stream_executor/tpu:tpu_executor_c_api_hdrs", "//tensorflow/stream_executor/tpu:tpu_node_context_c_api_hdrs", - "//tensorflow/stream_executor/tpu:tpu_platform_hdrs", ], ) diff --git a/tensorflow/core/tpu/kernels/BUILD b/tensorflow/core/tpu/kernels/BUILD index f69c97b81de..d82cf1b254b 100644 --- a/tensorflow/core/tpu/kernels/BUILD +++ b/tensorflow/core/tpu/kernels/BUILD @@ -1,4 +1,5 @@ # TPU Kernel Implementations + load( "//tensorflow/core/platform:build_config.bzl", "tf_proto_library_cc", @@ -86,8 +87,8 @@ cc_library( hdrs = ["tpu_compile_c_api.h"], deps = [ ":tpu_mesh_state_c_api_hdrs", - ":tpu_ops_common_c_api_hdrs", ":tpu_program_c_api_hdrs", + ":tpu_util_c_api_hdrs", "//tensorflow/core/tpu:libtftpu_header", "//tensorflow/stream_executor/tpu:proto_helper", ], @@ -367,7 +368,6 @@ cc_library( cc_library( name = "tpu_util_hdrs", - srcs = [], hdrs = ["tpu_util.h"], deps = [ ":tpu_compilation_cache_key", @@ -390,17 +390,11 @@ cc_library( alwayslink = True, ) -cc_library( - name = "tpu_ops_common_c_api_hdrs", - hdrs = ["tpu_ops_common_c_api.h"], - alwayslink = True, -) - cc_library( name = "tpu_program_c_api_hdrs", hdrs = ["tpu_program_c_api.h"], deps = [ - ":tpu_ops_common_c_api_hdrs", + ":tpu_util_c_api_hdrs", "//tensorflow/stream_executor/tpu:proto_helper", ], alwayslink = True, diff --git a/tensorflow/core/tpu/kernels/tpu_compile_c_api.h b/tensorflow/core/tpu/kernels/tpu_compile_c_api.h index eab53fe9da4..e82df78b3bd 100644 --- a/tensorflow/core/tpu/kernels/tpu_compile_c_api.h +++ b/tensorflow/core/tpu/kernels/tpu_compile_c_api.h @@ -16,8 +16,8 @@ limitations under the License. #define TENSORFLOW_CORE_TPU_KERNELS_TPU_COMPILE_C_API_H_ #include "tensorflow/core/tpu/kernels/tpu_mesh_state_c_api.h" -#include "tensorflow/core/tpu/kernels/tpu_ops_common_c_api.h" #include "tensorflow/core/tpu/kernels/tpu_program_c_api.h" +#include "tensorflow/core/tpu/kernels/tpu_util_c_api.h" #include "tensorflow/core/tpu/libtftpu.h" #include "tensorflow/stream_executor/tpu/proto_helper.h" diff --git a/tensorflow/core/tpu/kernels/tpu_ops_common_c_api.h b/tensorflow/core/tpu/kernels/tpu_ops_common_c_api.h deleted file mode 100644 index 987eb64925f..00000000000 --- a/tensorflow/core/tpu/kernels/tpu_ops_common_c_api.h +++ /dev/null @@ -1,20 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ -#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_OPS_COMMON_C_API_H_ -#define TENSORFLOW_CORE_TPU_KERNELS_TPU_OPS_COMMON_C_API_H_ - -typedef struct SE_Status SE_Status; - -#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_OPS_COMMON_C_API_H_ diff --git a/tensorflow/core/tpu/kernels/tpu_program_c_api.h b/tensorflow/core/tpu/kernels/tpu_program_c_api.h index 43cbe37d258..254527e7a2a 100644 --- a/tensorflow/core/tpu/kernels/tpu_program_c_api.h +++ b/tensorflow/core/tpu/kernels/tpu_program_c_api.h @@ -15,7 +15,7 @@ limitations under the License. #ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_PROGRAM_C_API_H_ #define TENSORFLOW_CORE_TPU_KERNELS_TPU_PROGRAM_C_API_H_ -#include "tensorflow/core/tpu/kernels/tpu_ops_common_c_api.h" +#include "tensorflow/core/tpu/kernels/tpu_util_c_api.h" #include "tensorflow/stream_executor/tpu/proto_helper.h" typedef struct XLA_TpuProgram XLA_TpuProgram; diff --git a/tensorflow/core/tpu/tpu_api_dlsym_initializer.cc b/tensorflow/core/tpu/tpu_api_dlsym_initializer.cc index c6666421327..495e6a2219b 100644 --- a/tensorflow/core/tpu/tpu_api_dlsym_initializer.cc +++ b/tensorflow/core/tpu/tpu_api_dlsym_initializer.cc @@ -21,7 +21,6 @@ limitations under the License. #include "tensorflow/core/platform/status.h" #include "tensorflow/core/tpu/tpu_api.h" #include "tensorflow/stream_executor/tpu/tpu_node_context_c_api.h" -#include "tensorflow/stream_executor/tpu/tpu_platform.h" #define TFTPU_SET_FN(Struct, FnName) \ Struct->FnName##Fn = \ diff --git a/tensorflow/core/tpu/tpu_library_init_fns.inc b/tensorflow/core/tpu/tpu_library_init_fns.inc index e21d7f195ad..29fdb42d95e 100644 --- a/tensorflow/core/tpu/tpu_library_init_fns.inc +++ b/tensorflow/core/tpu/tpu_library_init_fns.inc @@ -137,6 +137,7 @@ tensorflow::Status SetTpuNodeContextStructFns(void* library_handle) { TFTPU_SET_FN(node_context_fn, TpuNodeContext_Create); TFTPU_SET_FN(node_context_fn, TpuNodeContext_Free); + TFTPU_SET_FN(node_context_fn, TpuNodeContext_Initialize); TFTPU_SET_FN(node_context_fn, TpuNodeContext_StopChipHeartbeats); TFTPU_SET_FN(node_context_fn, TpuNodeContext_CloseTpuHost); diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 3675c26751c..106e7445be9 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -15370,6 +15370,80 @@ func MergeSummary(scope *Scope, inputs []tf.Output) (summary tf.Output) { return op.Output(0) } +// DecodeImageAttr is an optional argument to DecodeImage. +type DecodeImageAttr func(optionalAttr) + +// DecodeImageChannels sets the optional channels attribute to value. +// +// value: Number of color channels for the decoded image. +// If not specified, defaults to 0 +func DecodeImageChannels(value int64) DecodeImageAttr { + return func(m optionalAttr) { + m["channels"] = value + } +} + +// DecodeImageDtype sets the optional dtype attribute to value. +// +// value: The desired DType of the returned Tensor. +// If not specified, defaults to DT_UINT8 +func DecodeImageDtype(value tf.DataType) DecodeImageAttr { + return func(m optionalAttr) { + m["dtype"] = value + } +} + +// DecodeImageExpandAnimations sets the optional expand_animations attribute to value. +// +// value: Controls the output shape of the returned op. If True, the returned op will +// produce a 3-D tensor for PNG, JPEG, and BMP files; and a 4-D tensor for all +// GIFs, whether animated or not. 
If, False, the returned op will produce a 3-D +// tensor for all file types and will truncate animated GIFs to the first frame. +// If not specified, defaults to true +func DecodeImageExpandAnimations(value bool) DecodeImageAttr { + return func(m optionalAttr) { + m["expand_animations"] = value + } +} + +// Function for decode_bmp, decode_gif, decode_jpeg, and decode_png. +// +// Detects whether an image is a BMP, GIF, JPEG, or PNG, and performs the +// appropriate operation to convert the input bytes string into a Tensor of type +// dtype. +// +// *NOTE*: decode_gif returns a 4-D array [num_frames, height, width, 3], as +// opposed to decode_bmp, decode_jpeg and decode_png, which return 3-D arrays +// [height, width, num_channels]. Make sure to take this into account when +// constructing your graph if you are intermixing GIF files with BMP, JPEG, and/or +// PNG files. Alternately, set the expand_animations argument of this function to +// False, in which case the op will return 3-dimensional tensors and will truncate +// animated GIF files to the first frame. +// +// Arguments: +// contents: 0-D. The encoded image bytes. +// +// Returns 3-D with shape `[height, width, channels]` or 4-D with shape +// `[frame, height, width, channels]`.. +func DecodeImage(scope *Scope, contents tf.Output, optional ...DecodeImageAttr) (image tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "DecodeImage", + Input: []tf.Input{ + contents, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // AvgPoolAttr is an optional argument to AvgPool. type AvgPoolAttr func(optionalAttr) diff --git a/tensorflow/lite/core/api/flatbuffer_conversions.cc b/tensorflow/lite/core/api/flatbuffer_conversions.cc index 4460a400f23..a2b6c6de06c 100644 --- a/tensorflow/lite/core/api/flatbuffer_conversions.cc +++ b/tensorflow/lite/core/api/flatbuffer_conversions.cc @@ -177,6 +177,92 @@ TfLiteStatus ConvertTensorType(TensorType tensor_type, TfLiteType* type, } } +// We have this parse function instead of directly returning kTfLiteOk from the +// switch-case in ParseOpData because this function is used as part of the +// selective registration for the OpResolver implementation in micro. +TfLiteStatus ParseAbs(const Operator*, BuiltinOperator, ErrorReporter*, + BuiltinDataAllocator*, void**) { + return kTfLiteOk; +} + +TfLiteStatus ParseAdd(const Operator* op, BuiltinOperator, + ErrorReporter* error_reporter, + BuiltinDataAllocator* allocator, void** builtin_data) { + CheckParsePointerParams(op, error_reporter, allocator, builtin_data); + + SafeBuiltinDataAllocator safe_allocator(allocator); + std::unique_ptr + params = safe_allocator.Allocate(); + TF_LITE_ENSURE(error_reporter, params != nullptr); + + const AddOptions* schema_params = op->builtin_options_as_AddOptions(); + + if (schema_params != nullptr) { + params->activation = + ConvertActivation(schema_params->fused_activation_function()); + params->pot_scale_int16 = schema_params->pot_scale_int16(); + } else { + // TODO(b/157480169): We should either return kTfLiteError or fill in some + // reasonable defaults in the params struct. We are not doing so until we + // better undertand the ramifications of changing the legacy behavior. 
+  }
+
+  *builtin_data = params.release();
+  return kTfLiteOk;
+}
+
+TfLiteStatus ParseArgMax(const Operator* op, BuiltinOperator,
+                         ErrorReporter* error_reporter,
+                         BuiltinDataAllocator* allocator, void** builtin_data) {
+  CheckParsePointerParams(op, error_reporter, allocator, builtin_data);
+
+  SafeBuiltinDataAllocator safe_allocator(allocator);
+  std::unique_ptr<TfLiteArgMaxParams,
+                  SafeBuiltinDataAllocator::BuiltinDataDeleter>
+      params = safe_allocator.Allocate<TfLiteArgMaxParams>();
+  TF_LITE_ENSURE(error_reporter, params != nullptr);
+
+  const ArgMaxOptions* schema_params = op->builtin_options_as_ArgMaxOptions();
+
+  if (schema_params != nullptr) {
+    TF_LITE_ENSURE_STATUS(ConvertTensorType(
+        schema_params->output_type(), &params->output_type, error_reporter));
+  } else {
+    // TODO(b/157480169): We should either return kTfLiteError or fill in some
+    // reasonable defaults in the params struct. We are not doing so until we
+    // better understand the ramifications of changing the legacy behavior.
+  }
+
+  *builtin_data = params.release();
+  return kTfLiteOk;
+}
+
+TfLiteStatus ParseArgMin(const Operator* op, BuiltinOperator,
+                         ErrorReporter* error_reporter,
+                         BuiltinDataAllocator* allocator, void** builtin_data) {
+  CheckParsePointerParams(op, error_reporter, allocator, builtin_data);
+
+  SafeBuiltinDataAllocator safe_allocator(allocator);
+  std::unique_ptr<TfLiteArgMinParams,
+                  SafeBuiltinDataAllocator::BuiltinDataDeleter>
+      params = safe_allocator.Allocate<TfLiteArgMinParams>();
+  TF_LITE_ENSURE(error_reporter, params != nullptr);
+
+  const ArgMinOptions* schema_params = op->builtin_options_as_ArgMinOptions();
+
+  if (schema_params != nullptr) {
+    TF_LITE_ENSURE_STATUS(ConvertTensorType(
+        schema_params->output_type(), &params->output_type, error_reporter));
+  } else {
+    // TODO(b/157480169): We should either return kTfLiteError or fill in some
+    // reasonable defaults in the params struct. We are not doing so until we
+    // better understand the ramifications of changing the legacy behavior.
+ } + + *builtin_data = params.release(); + return kTfLiteOk; +} + TfLiteStatus ParseConv2D(const Operator* op, BuiltinOperator, ErrorReporter* error_reporter, BuiltinDataAllocator* allocator, void** builtin_data) { @@ -430,6 +516,22 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type, SafeBuiltinDataAllocator safe_allocator(allocator); *builtin_data = nullptr; switch (op_type) { + case BuiltinOperator_ABS: { + return ParseAbs(op, op_type, error_reporter, allocator, builtin_data); + } + + case BuiltinOperator_ADD: { + return ParseAdd(op, op_type, error_reporter, allocator, builtin_data); + } + + case BuiltinOperator_ARG_MAX: { + return ParseArgMax(op, op_type, error_reporter, allocator, builtin_data); + } + + case BuiltinOperator_ARG_MIN: { + return ParseArgMin(op, op_type, error_reporter, allocator, builtin_data); + } + case BuiltinOperator_CONV_2D: { return ParseConv2D(op, op_type, error_reporter, allocator, builtin_data); } @@ -586,17 +688,6 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type, *builtin_data = params.release(); return kTfLiteOk; } - case BuiltinOperator_ADD: { - auto params = safe_allocator.Allocate(); - TF_LITE_ENSURE(error_reporter, params != nullptr); - if (const auto* schema_params = op->builtin_options_as_AddOptions()) { - params->activation = - ConvertActivation(schema_params->fused_activation_function()); - params->pot_scale_int16 = schema_params->pot_scale_int16(); - } - *builtin_data = params.release(); - return kTfLiteOk; - } case BuiltinOperator_DIV: { auto params = safe_allocator.Allocate(); TF_LITE_ENSURE(error_reporter, params != nullptr); @@ -840,28 +931,6 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type, *builtin_data = params.release(); return kTfLiteOk; } - case BuiltinOperator_ARG_MAX: { - auto params = safe_allocator.Allocate(); - TF_LITE_ENSURE(error_reporter, params != nullptr); - if (const auto* schema_params = op->builtin_options_as_ArgMaxOptions()) { - TF_LITE_ENSURE_STATUS(ConvertTensorType(schema_params->output_type(), - ¶ms->output_type, - error_reporter)); - } - *builtin_data = params.release(); - return kTfLiteOk; - } - case BuiltinOperator_ARG_MIN: { - auto params = safe_allocator.Allocate(); - TF_LITE_ENSURE(error_reporter, params != nullptr); - if (const auto* schema_params = op->builtin_options_as_ArgMinOptions()) { - TF_LITE_ENSURE_STATUS(ConvertTensorType(schema_params->output_type(), - ¶ms->output_type, - error_reporter)); - } - *builtin_data = params.release(); - return kTfLiteOk; - } case BuiltinOperator_TRANSPOSE_CONV: { auto params = safe_allocator.Allocate(); TF_LITE_ENSURE(error_reporter, params != nullptr); @@ -1021,7 +1090,6 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type, return kTfLiteOk; } // Below are the ops with no builtin_data structure. - case BuiltinOperator_ABS: case BuiltinOperator_BATCH_TO_SPACE_ND: // TODO(aselle): Implement call in BuiltinOptions, but nullptrs are // ok for now, since there is no call implementation either. diff --git a/tensorflow/lite/core/api/flatbuffer_conversions.h b/tensorflow/lite/core/api/flatbuffer_conversions.h index 78d2aca6222..a6431aa5ee1 100644 --- a/tensorflow/lite/core/api/flatbuffer_conversions.h +++ b/tensorflow/lite/core/api/flatbuffer_conversions.h @@ -75,6 +75,22 @@ TfLiteStatus ConvertTensorType(TensorType tensor_type, TfLiteType* type, // removed once we are no longer using ParseOpData for the OpResolver // implementation in micro. 
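+//
+// A minimal sketch of how one of these parsers can be called directly,
+// assuming the caller already has the flatbuffer Operator, an ErrorReporter,
+// and a BuiltinDataAllocator (illustrative only, not a call site added here):
+//
+//   void* builtin_data = nullptr;
+//   if (ParseAdd(op, BuiltinOperator_ADD, error_reporter, allocator,
+//                &builtin_data) != kTfLiteOk) {
+//     return kTfLiteError;
+//   }
+//   auto* params = reinterpret_cast<TfLiteAddParams*>(builtin_data);
+//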
+TfLiteStatus ParseAbs(const Operator* op, BuiltinOperator op_type, + ErrorReporter* error_reporter, + BuiltinDataAllocator* allocator, void** builtin_data); + +TfLiteStatus ParseAdd(const Operator* op, BuiltinOperator op_type, + ErrorReporter* error_reporter, + BuiltinDataAllocator* allocator, void** builtin_data); + +TfLiteStatus ParseArgMax(const Operator* op, BuiltinOperator op_type, + ErrorReporter* error_reporter, + BuiltinDataAllocator* allocator, void** builtin_data); + +TfLiteStatus ParseArgMin(const Operator* op, BuiltinOperator op_type, + ErrorReporter* error_reporter, + BuiltinDataAllocator* allocator, void** builtin_data); + TfLiteStatus ParseConv2D(const Operator* op, BuiltinOperator op_type, ErrorReporter* error_reporter, BuiltinDataAllocator* allocator, void** builtin_data); diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin.cc b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin.cc index d65ff071c7e..020a99852d7 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin.cc @@ -27,21 +27,19 @@ namespace gpu { namespace cl { namespace { -std::string GenerateConvolutionTransposedCode( - const OperationDef& op_def, const LinearStorage& biases, int src_depth, - int dst_depth, const CLDevice& device, - const std::vector& linked_operations) { - TensorCodeGenerator src_tensor( - "src_data", - WHSBPoint{"src_size.x", "src_size.y", "src_size.z", "src_size.w"}, - op_def.src_tensors[0]); - TensorCodeGenerator dst_tensor( - "dst_data", - WHSBPoint{"dst_size.x", "dst_size.y", "dst_size.z", "dst_size.w"}, - op_def.dst_tensors[0]); +std::string GenerateConvolutionTransposedCode(const OperationDef& op_def, + int src_depth, int dst_depth, + const CLDevice& device, + Arguments* args) { + auto src_desc = absl::make_unique(op_def.src_tensors[0]); + src_desc->SetTextureAddressMode(GetFastestZeroMode(device)); + args->AddObjectRef("src_tensor", AccessType::READ, std::move(src_desc)); + args->AddObjectRef( + "dst_tensor", AccessType::WRITE, + absl::make_unique(op_def.dst_tensors[0])); + const auto src_tensor_type = op_def.src_tensors[0].storage_type; - const std::string batch_id = op_def.IsBatchSupported() ? 
"B" : ""; std::string c = GetCommonDefines(op_def.precision); switch (op_def.precision) { @@ -61,23 +59,19 @@ std::string GenerateConvolutionTransposedCode( } c += "__kernel void main_function(\n"; - c += src_tensor.GetDeclaration(AccessType::READ) + ",\n"; - c += " __constant FLT4* filters, \n"; - c += biases.GetDeclaration(); - c += GetArgsDeclaration(linked_operations); - c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n"; - c += " int4 src_size, \n"; - c += " int4 dst_size \n"; - c += ") {\n"; + c += "$0) {\n"; if (op_def.IsBatchSupported()) { c += " int linear_id = get_global_id(0);\n"; - c += " int X = linear_id / dst_size.w;\n"; - c += " int B = linear_id % dst_size.w;\n"; + c += " int X = linear_id / args.dst_tensor.Batch();\n"; + c += " int B = linear_id % args.dst_tensor.Batch();\n"; + c += " args.dst_tensor.SetBatchRef(B);\n"; + c += " args.src_tensor.SetBatchRef(B);\n"; } else { c += " int X = get_global_id(0);\n"; } c += " int Y = get_global_id(1);\n"; - c += " if (X >= src_size.x || Y >= src_size.y) return;\n"; + c += " if (X >= args.src_tensor.Width() || Y >= args.src_tensor.Height()) " + "return;\n"; for (int d = 0; d < dst_depth; ++d) { const std::string layer = std::to_string(d); c += " ACCUM_FLT4 r" + layer + "[2][2];\n"; @@ -91,61 +85,48 @@ std::string GenerateConvolutionTransposedCode( const std::string z = std::to_string(s); c += " {\n"; if (src_tensor_type == TensorStorageType::BUFFER) { - c += " bool x_in = X + 1 < src_size.x;\n"; - c += " bool y_in = Y + 1 < src_size.y;\n"; - c += - " FLT4 src0 = " + src_tensor.ReadWHSB("X", "Y", z, batch_id) + ";\n"; + c += " bool x_in = X + 1 < args.src_tensor.Width();\n"; + c += " bool y_in = Y + 1 < args.src_tensor.Height();\n"; + c += " FLT4 src0 = args.src_tensor.Read(X, Y, " + z + ");\n"; c += " FLT4 src1 = (FLT4)(0.0);\n"; c += " FLT4 src2 = (FLT4)(0.0);\n"; c += " FLT4 src3 = (FLT4)(0.0);\n"; c += " if (x_in) {\n"; - c += " src1 = " + src_tensor.ReadWHSB("X + 1", "Y", z, batch_id) + - ";\n"; + c += " src1 = args.src_tensor.Read(X + 1, Y, " + z + ");\n"; c += " }\n"; c += " if (y_in) {\n"; - c += " src2 = " + src_tensor.ReadWHSB("X", "Y + 1", z, batch_id) + - ";\n"; + c += " src2 = args.src_tensor.Read(X, Y + 1, " + z + ");\n"; c += " }\n"; c += " if (x_in && y_in) {\n"; - c += " src3 = " + src_tensor.ReadWHSB("X + 1", "Y + 1", z, batch_id) + - ";\n"; + c += " src3 = args.src_tensor.Read(X + 1, Y + 1, " + z + ");\n"; c += " }\n"; } else if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) { - c += - " " + src_tensor.GetAddressWHSB("c0", "X", "Y", z, batch_id) + ";\n"; - c += " " + src_tensor.GetAddressWHSB("c1", "X + 1", "Y", z, batch_id) + - ";\n"; - c += " " + src_tensor.GetAddressWHSB("c2", "X", "Y + 1", z, batch_id) + - ";\n"; - c += " " + - src_tensor.GetAddressWHSB("c3", "X + 1", "Y + 1", z, batch_id) + - ";\n"; - c += " bool x_in = X + 1 < src_size.x;\n"; - c += " bool y_in = Y + 1 < src_size.y;\n"; + c += " args.src_tensor.GetAddress(c0, X, Y, " + z + ");\n"; + c += " args.src_tensor.GetAddress(c1, X + 1, Y, " + z + ");\n"; + c += " args.src_tensor.GetAddress(c2, X, Y + 1, " + z + ");\n"; + c += " args.src_tensor.GetAddress(c3, X + 1, Y + 1, " + z + ");\n"; + c += " bool x_in = X + 1 < args.src_tensor.Width();\n"; + c += " bool y_in = Y + 1 < args.src_tensor.Height();\n"; c += " c1 = select(-1, c1, x_in);\n"; c += " c2 = select(-1, c2, y_in);\n"; c += " c3 = select(-1, c3, x_in && y_in);\n"; - c += " FLT4 src0 = " + src_tensor.Read("c0") + ";\n"; - c += " FLT4 src1 = " + src_tensor.Read("c1") + ";\n"; - c 
+= " FLT4 src2 = " + src_tensor.Read("c2") + ";\n"; - c += " FLT4 src3 = " + src_tensor.Read("c3") + ";\n"; + c += " FLT4 src0 = args.src_tensor.Read(c0);\n"; + c += " FLT4 src1 = args.src_tensor.Read(c1);\n"; + c += " FLT4 src2 = args.src_tensor.Read(c2);\n"; + c += " FLT4 src3 = args.src_tensor.Read(c3);\n"; } else { - const auto mode = GetFastestZeroMode(device); - c += " FLT4 src0 = " + src_tensor.ReadWHSB("X", "Y", z, batch_id, mode) + - ";\n"; - c += " FLT4 src1 = " + - src_tensor.ReadWHSB("X + 1", "Y", z, batch_id, mode) + ";\n"; - c += " FLT4 src2 = " + - src_tensor.ReadWHSB("X", "Y + 1", z, batch_id, mode) + ";\n"; - c += " FLT4 src3 = " + - src_tensor.ReadWHSB("X + 1", "Y + 1", z, batch_id, mode) + ";\n"; + c += " FLT4 src0 = args.src_tensor.Read(X, Y, " + z + ");\n"; + c += " FLT4 src1 = args.src_tensor.Read(X + 1, Y, " + z + ");\n"; + c += " FLT4 src2 = args.src_tensor.Read(X, Y + 1, " + z + ");\n"; + c += " FLT4 src3 = args.src_tensor.Read(X + 1, Y + 1, " + z + ");\n"; } for (int d = 0; d < dst_depth; ++d) { const std::string layer = std::to_string(d); const std::string f_offset = std::to_string(filters_index); filters_index++; c += " {\n"; - c += " __constant FLT4* L0 = filters + 36 * " + f_offset + ";\n"; + c += " __constant FLT4* L0 = args.weights.GetPtr() + 36 * " + f_offset + + ";\n"; c += " CONV(r" + layer + "[0][0], src0, L0, 0);\n"; c += " CONV(r" + layer + "[0][1], src0, L0, 4);\n"; c += " CONV(r" + layer + "[0][1], src1, L0, 8);\n"; @@ -164,7 +145,8 @@ std::string GenerateConvolutionTransposedCode( for (int d = 0; d < dst_depth; ++d) { const std::string layer = std::to_string(d); c += " {\n"; - c += " FLT4 bias_val = " + biases.ReadLinearFLT4(layer) + ";\n"; + c += " FLT4 bias_val = args.weights.Read(" + + std::to_string(36 * filters_index + d) + ");\n"; for (int y = 0; y < 2; ++y) { for (int x = 0; x < 2; ++x) { const std::string x_coord = "X + " + std::to_string(x); @@ -172,14 +154,8 @@ std::string GenerateConvolutionTransposedCode( c += " {\n"; c += " FLT4 result = TO_FLT4(r" + layer + "[" + std::to_string(y) + "][" + std::to_string(x) + "]) + bias_val;\n"; - const std::string x_3dcoord = op_def.IsBatchSupported() - ? 
"(" + x_coord + ") * dst_size.w + B" - : x_coord; - const LinkingContext context{"result", x_3dcoord, y_coord, layer}; - c += PostProcess(linked_operations, context); - c += " " + - dst_tensor.WriteWHSB("result", x_coord, y_coord, layer, batch_id) + - "\n"; + c += " args.dst_tensor.Write(result, " + x_coord + ", " + y_coord + + ", " + layer + ");\n"; c += " }\n"; } } @@ -200,8 +176,6 @@ ConvolutionTransposed3x3Thin::ConvolutionTransposed3x3Thin( ConvolutionTransposed3x3Thin::ConvolutionTransposed3x3Thin( ConvolutionTransposed3x3Thin&& operation) : GPUOperation(std::move(operation)), - weights_(std::move(operation.weights_)), - biases_(std::move(operation.biases_)), src_channels_(operation.src_channels_), dst_channels_(operation.dst_channels_), kernel_(std::move(operation.kernel_)), @@ -210,8 +184,6 @@ ConvolutionTransposed3x3Thin::ConvolutionTransposed3x3Thin( ConvolutionTransposed3x3Thin& ConvolutionTransposed3x3Thin::operator=( ConvolutionTransposed3x3Thin&& operation) { if (this != &operation) { - weights_ = std::move(operation.weights_); - biases_ = std::move(operation.biases_); std::swap(src_channels_, operation.src_channels_); std::swap(dst_channels_, operation.dst_channels_); kernel_ = std::move(operation.kernel_); @@ -223,25 +195,25 @@ ConvolutionTransposed3x3Thin& ConvolutionTransposed3x3Thin::operator=( absl::Status ConvolutionTransposed3x3Thin::Compile( const CreationContext& creation_context) { - const auto code = GenerateConvolutionTransposedCode( - definition_, biases_, DivideRoundUp(src_channels_, 4), - DivideRoundUp(dst_channels_, 4), *creation_context.device, - linked_operations_); + std::string code = GenerateConvolutionTransposedCode( + definition_, DivideRoundUp(src_channels_, 4), + DivideRoundUp(dst_channels_, 4), *creation_context.device, &args_); + std::string element_wise_code; + RETURN_IF_ERROR( + MergeOperations(linked_operations_, &args_, &element_wise_code)); + RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(), + {{"dst_tensor", element_wise_code}}, + &code)); return creation_context.cache->GetOrCreateCLKernel( code, "main_function", *creation_context.context, *creation_context.device, &kernel_); } absl::Status ConvolutionTransposed3x3Thin::BindArguments() { - kernel_.ResetBindingCounter(); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr())); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_.GetMemoryPtr())); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(biases_.GetMemoryPtr())); - RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_)); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHSB())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHSB())); - return absl::OkStatus(); + RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0])); + RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0])); + RETURN_IF_ERROR(SetArguments(linked_operations_, &args_)); + return args_.Bind(kernel_.kernel()); } int3 ConvolutionTransposed3x3Thin::GetGridSize() const { @@ -282,15 +254,7 @@ absl::Status CreateConvolutionTransposed3x3Thin( } *result = ConvolutionTransposed3x3Thin(definition, attr); RETURN_IF_ERROR( - result->UploadWeights(attr.weights, creation_context.context)); - LinearStorageCreateInfo create_info; - create_info.storage_type = - DeduceLinearStorageType(definition.GetPrimaryStorageType()); - create_info.data_type = definition.GetDataType(); - create_info.name = "biases"; - create_info.aligned_size = attr.weights.shape.o; - 
RETURN_IF_ERROR(CreateLinearStorage( - create_info, attr.bias, creation_context.context, &result->biases_)); + result->UploadData(attr.weights, attr.bias, creation_context.context)); return absl::OkStatus(); } diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin.h b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin.h index 447afb621e2..e292f416796 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin.h @@ -59,8 +59,9 @@ class ConvolutionTransposed3x3Thin : public GPUOperation { const OperationDef& definition, const ConvolutionTransposedAttributes& attr); template - absl::Status UploadWeights(const tflite::gpu::Tensor& weights, - CLContext* context); + absl::Status UploadData(const tflite::gpu::Tensor& weights, + const tflite::gpu::Tensor& biases, + CLContext* context); template void RearrangeWeightsData(const tflite::gpu::Tensor& weights, @@ -69,9 +70,6 @@ class ConvolutionTransposed3x3Thin : public GPUOperation { absl::Status BindArguments(); int3 GetGridSize() const; - Buffer weights_; - LinearStorage biases_; - int src_channels_; int dst_channels_; @@ -80,29 +78,58 @@ class ConvolutionTransposed3x3Thin : public GPUOperation { }; template -absl::Status ConvolutionTransposed3x3Thin::UploadWeights( - const tflite::gpu::Tensor& weights, CLContext* context) { +absl::Status ConvolutionTransposed3x3Thin::UploadData( + const tflite::gpu::Tensor& weights, + const tflite::gpu::Tensor& biases, CLContext* context) { const int src_depth = DivideRoundUp(src_channels_, 4); const int dst_depth = DivideRoundUp(dst_channels_, 4); const int kernel_x = 3; // This operation support only 3x3 kernel const int kernel_y = 3; const int flt4_count = kernel_x * kernel_y * src_depth * dst_depth * 4; - const int flt4_size = definition_.precision == CalculationsPrecision::F32 - ? sizeof(float4) - : sizeof(half4); + const bool f32_weights = definition_.precision == CalculationsPrecision::F32; - if (definition_.GetDataType() == DataType::FLOAT32) { + BufferDescriptor desc; + desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16; + desc.element_size = 4; + desc.memory_type = MemoryType::CONSTANT; + + Buffer weights_buffer; + if (f32_weights) { std::vector gpu_data(flt4_count); RearrangeWeightsData(weights, absl::MakeSpan(gpu_data)); - return CreateReadOnlyBuffer(flt4_size * flt4_count, gpu_data.data(), - context, &weights_); + for (int i = 0; i < dst_depth; ++i) { + float4 bias_value(0.0f); + for (int c = 0; c < 4; ++c) { + int ch = i * 4 + c; + bias_value[c] = ch < weights.shape.o ? biases.data[ch] : 0.0f; + } + gpu_data.push_back(bias_value); + } + RETURN_IF_ERROR(CreateReadOnlyBuffer(sizeof(float4) * gpu_data.size(), + gpu_data.data(), context, + &weights_buffer)); } else { std::vector gpu_data(flt4_count); RearrangeWeightsData(weights, absl::MakeSpan(gpu_data)); - return CreateReadOnlyBuffer(flt4_size * flt4_count, gpu_data.data(), - context, &weights_); + for (int i = 0; i < dst_depth; ++i) { + half4 bias_value(0.0f); + for (int c = 0; c < 4; ++c) { + int ch = i * 4 + c; + bias_value[c] = ch < weights.shape.o ? 
biases.data[ch] : 0.0f; + } + gpu_data.push_back(bias_value); + } + RETURN_IF_ERROR(CreateReadOnlyBuffer(sizeof(half4) * gpu_data.size(), + gpu_data.data(), context, + &weights_buffer)); } + + args_.AddObject("weights", AccessType::READ, + absl::make_unique(std::move(weights_buffer)), + absl::make_unique(desc)); + + return absl::OkStatus(); } template diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin_test.cc index 1d25605582a..82d4492866d 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin_test.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3_thin_test.cc @@ -43,7 +43,7 @@ TEST_F(OpenCLOperationTest, ConvolutionTransposed3x3ThinSimpleWeights) { attr.weights.shape = OHWI(1, 3, 3, 1); attr.weights.data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; attr.bias.shape = Linear(2); - attr.bias.data = {0.0f}; + attr.bias.data = {0.0f, 0.0f}; for (auto storage : env_.GetSupportedStorages()) { for (auto precision : env_.GetSupportedPrecisions()) { diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv_3x3.cc b/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv_3x3.cc index c8ac82581c0..309ce4a9d87 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv_3x3.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv_3x3.cc @@ -28,55 +28,47 @@ namespace gpu { namespace cl { namespace { -std::string GenerateDepthwiseConvCode( - const OperationDef& op_def, - const std::vector& linked_operations, - const CLDevice& device, bool weights_are_buffer, bool local_mem_uploads) { - std::string c = GetCommonDefines(op_def.precision); - TensorCodeGenerator src_tensor( - "src_data", WHSPoint{"dst_size.x", "dst_size.y", "dst_size.z"}, - op_def.src_tensors[0]); - TensorCodeGenerator dst_tensor( - "dst_data", WHSPoint{"dst_size.x", "dst_size.y", "dst_size.z"}, - op_def.dst_tensors[0]); +std::string GenerateDepthwiseConvCode(const OperationDef& op_def, + const CLDevice& device, + bool weights_are_buffer, + bool local_mem_uploads, Arguments* args) { + auto src_desc = absl::make_unique(op_def.src_tensors[0]); + src_desc->SetTextureAddressMode(GetFastestZeroMode(device)); + args->AddObjectRef("src_tensor", AccessType::READ, std::move(src_desc)); + args->AddObjectRef( + "dst_tensor", AccessType::WRITE, + absl::make_unique(op_def.dst_tensors[0])); const auto src_tensor_type = op_def.src_tensors[0].storage_type; - const auto mode = GetFastestZeroMode(device); - const bool manual_clamp = src_tensor_type == TensorStorageType::BUFFER || src_tensor_type == TensorStorageType::IMAGE_BUFFER; + std::string c = GetCommonDefines(op_def.precision); if (local_mem_uploads) { c += "__attribute__((reqd_work_group_size(8, 4, 1)))\n"; } c += "__kernel void main_function(\n"; - c += src_tensor.GetDeclaration(AccessType::READ) + ",\n"; - if (weights_are_buffer) { - c += " __global FLT4* filters\n"; - } else { - c += " __read_only image2d_t filters\n"; - } - c += GetArgsDeclaration(linked_operations); - c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n"; - c += " int4 dst_size\n"; - c += ") {\n"; + c += "$0) {\n"; c += " int X = get_global_id(0) * 2;\n"; c += " int Y = get_global_id(1) * 2;\n"; - c += " int Z = get_global_id(2);\n"; + c += " int S = get_global_id(2);\n"; c += " ACCUM_FLT4 r0 = (ACCUM_FLT4)(0.0f);\n"; c += " ACCUM_FLT4 r1 = (ACCUM_FLT4)(0.0f);\n"; c += " ACCUM_FLT4 r2 = (ACCUM_FLT4)(0.0f);\n"; c 
+= " ACCUM_FLT4 r3 = (ACCUM_FLT4)(0.0f);\n"; if (!local_mem_uploads) { - c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) " - "return;\n"; + c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() " + "|| S >= args.dst_tensor.Slices()) { \n"; + c += " return; \n"; + c += " } \n"; } if (local_mem_uploads) { c += " __local FLT4 f[10];\n"; - c += " event_t e = async_work_group_copy(f, filters + Z * 10, 10, 0);\n"; + c += " event_t e = async_work_group_copy(f, args.weights.GetPtr() + S * " + "10, 10, 0);\n"; c += " wait_group_events(1, &e);\n"; } else if (weights_are_buffer) { - c += " __global FLT4* f = filters + Z * 10;\n"; + c += " __global FLT4* f = args.weights.GetPtr() + S * 10;\n"; } c += " FLT4 s0;\n"; c += " FLT4 s1;\n"; @@ -87,15 +79,15 @@ std::string GenerateDepthwiseConvCode( std::string xc[4] = {"X - 1", "X", "X + 1", "X + 2"}; std::string yc[4] = {"Y - 1", "Y", "Y + 1", "Y + 2"}; if (!weights_are_buffer) { - c += " FLT4 f0 = READ_IMAGE(filters, smp_none, (int2)(0, Z));\n"; - c += " FLT4 f1 = READ_IMAGE(filters, smp_none, (int2)(1, Z));\n"; - c += " FLT4 f2 = READ_IMAGE(filters, smp_none, (int2)(2, Z));\n"; - c += " FLT4 f3 = READ_IMAGE(filters, smp_none, (int2)(3, Z));\n"; - c += " FLT4 f4 = READ_IMAGE(filters, smp_none, (int2)(4, Z));\n"; - c += " FLT4 f5 = READ_IMAGE(filters, smp_none, (int2)(5, Z));\n"; - c += " FLT4 f6 = READ_IMAGE(filters, smp_none, (int2)(6, Z));\n"; - c += " FLT4 f7 = READ_IMAGE(filters, smp_none, (int2)(7, Z));\n"; - c += " FLT4 f8 = READ_IMAGE(filters, smp_none, (int2)(8, Z));\n"; + c += " FLT4 f0 = args.weights.Read(0, S);\n"; + c += " FLT4 f1 = args.weights.Read(1, S);\n"; + c += " FLT4 f2 = args.weights.Read(2, S);\n"; + c += " FLT4 f3 = args.weights.Read(3, S);\n"; + c += " FLT4 f4 = args.weights.Read(4, S);\n"; + c += " FLT4 f5 = args.weights.Read(5, S);\n"; + c += " FLT4 f6 = args.weights.Read(6, S);\n"; + c += " FLT4 f7 = args.weights.Read(7, S);\n"; + c += " FLT4 f8 = args.weights.Read(8, S);\n"; } if (manual_clamp) { c += " int x0 = X - 1;\n"; @@ -106,25 +98,25 @@ std::string GenerateDepthwiseConvCode( c += " int y1 = Y;\n"; c += " int y2 = Y + 1;\n"; c += " int y3 = Y + 2;\n"; - c += " bool x0_in = x0 >= 0 && x0 < dst_size.x;\n"; - c += " bool x1_in = x1 >= 0 && x1 < dst_size.x;\n"; - c += " bool x2_in = x2 >= 0 && x2 < dst_size.x;\n"; - c += " bool x3_in = x3 >= 0 && x3 < dst_size.x;\n"; - c += " bool y0_in = y0 >= 0 && y0 < dst_size.y;\n"; - c += " bool y1_in = y1 >= 0 && y1 < dst_size.y;\n"; - c += " bool y2_in = y2 >= 0 && y2 < dst_size.y;\n"; - c += " bool y3_in = y3 >= 0 && y3 < dst_size.y;\n"; - c += " x0 = clamp(x0, 0, dst_size.x - 1);\n"; - c += " x1 = clamp(x1, 0, dst_size.x - 1);\n"; - c += " x2 = clamp(x2, 0, dst_size.x - 1);\n"; - c += " x3 = clamp(x3, 0, dst_size.x - 1);\n"; - c += " y0 = clamp(y0, 0, dst_size.y - 1);\n"; - c += " y1 = clamp(y1, 0, dst_size.y - 1);\n"; - c += " y2 = clamp(y2, 0, dst_size.y - 1);\n"; - c += " y3 = clamp(y3, 0, dst_size.y - 1);\n"; + c += " bool x0_in = x0 >= 0 && x0 < args.dst_tensor.Width();\n"; + c += " bool x1_in = x1 >= 0 && x1 < args.dst_tensor.Width();\n"; + c += " bool x2_in = x2 >= 0 && x2 < args.dst_tensor.Width();\n"; + c += " bool x3_in = x3 >= 0 && x3 < args.dst_tensor.Width();\n"; + c += " bool y0_in = y0 >= 0 && y0 < args.dst_tensor.Height();\n"; + c += " bool y1_in = y1 >= 0 && y1 < args.dst_tensor.Height();\n"; + c += " bool y2_in = y2 >= 0 && y2 < args.dst_tensor.Height();\n"; + c += " bool y3_in = y3 >= 0 && y3 < args.dst_tensor.Height();\n"; + c 
+= " x0 = clamp(x0, 0, args.dst_tensor.Width() - 1);\n"; + c += " x1 = clamp(x1, 0, args.dst_tensor.Width() - 1);\n"; + c += " x2 = clamp(x2, 0, args.dst_tensor.Width() - 1);\n"; + c += " x3 = clamp(x3, 0, args.dst_tensor.Width() - 1);\n"; + c += " y0 = clamp(y0, 0, args.dst_tensor.Height() - 1);\n"; + c += " y1 = clamp(y1, 0, args.dst_tensor.Height() - 1);\n"; + c += " y2 = clamp(y2, 0, args.dst_tensor.Height() - 1);\n"; + c += " y3 = clamp(y3, 0, args.dst_tensor.Height() - 1);\n"; if (src_tensor_type == TensorStorageType::BUFFER) { - c += " __global FLT4* src_loc = src_data + Z * dst_size.x * " - "dst_size.y;\n"; + c += " __global FLT4* src_loc = " + "args.src_tensor.GetPtrWithSliceOffset(S);\n"; } xc[0] = "x0"; xc[1] = "x1"; @@ -150,29 +142,29 @@ std::string GenerateDepthwiseConvCode( auto read_4x_line = [&](int y) { if (src_tensor_type == TensorStorageType::BUFFER) { const std::string y_in = "y" + std::to_string(y) + "_in"; - c += " s0 = src_loc[" + yc[y] + " * dst_size.x + " + xc[0] + - "] * (FLT)(x0_in && " + y_in + ");\n"; - c += " s1 = src_loc[" + yc[y] + " * dst_size.x + " + xc[1] + - "] * (FLT)(x1_in && " + y_in + ");\n"; - c += " s2 = src_loc[" + yc[y] + " * dst_size.x + " + xc[2] + - "] * (FLT)(x2_in && " + y_in + ");\n"; - c += " s3 = src_loc[" + yc[y] + " * dst_size.x + " + xc[3] + - "] * (FLT)(x3_in && " + y_in + ");\n"; + c += " s0 = src_loc[args.src_tensor.GetWHOffset(" + xc[0] + ", " + + yc[y] + ")] * (FLT)(x0_in && " + y_in + ");\n"; + c += " s1 = src_loc[args.src_tensor.GetWHOffset(" + xc[1] + ", " + + yc[y] + ")] * (FLT)(x1_in && " + y_in + ");\n"; + c += " s2 = src_loc[args.src_tensor.GetWHOffset(" + xc[2] + ", " + + yc[y] + ")] * (FLT)(x2_in && " + y_in + ");\n"; + c += " s3 = src_loc[args.src_tensor.GetWHOffset(" + xc[3] + ", " + + yc[y] + ")] * (FLT)(x3_in && " + y_in + ");\n"; } else if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) { const std::string y_in = "y" + std::to_string(y) + "_in"; - c += " s0 = " + src_tensor.ReadWHS(xc[0], yc[y], "Z", mode) + - " * (FLT)(x0_in && " + y_in + ");\n"; - c += " s1 = " + src_tensor.ReadWHS(xc[1], yc[y], "Z", mode) + - " * (FLT)(x1_in && " + y_in + ");\n"; - c += " s2 = " + src_tensor.ReadWHS(xc[2], yc[y], "Z", mode) + - " * (FLT)(x2_in && " + y_in + ");\n"; - c += " s3 = " + src_tensor.ReadWHS(xc[3], yc[y], "Z", mode) + - " * (FLT)(x3_in && " + y_in + ");\n"; + c += " s0 = args.src_tensor.Read(" + xc[0] + ", " + yc[y] + + ", S) * (FLT)(x0_in && " + y_in + ");\n"; + c += " s1 = args.src_tensor.Read(" + xc[1] + ", " + yc[y] + + ", S) * (FLT)(x1_in && " + y_in + ");\n"; + c += " s2 = args.src_tensor.Read(" + xc[2] + ", " + yc[y] + + ", S) * (FLT)(x2_in && " + y_in + ");\n"; + c += " s3 = args.src_tensor.Read(" + xc[3] + ", " + yc[y] + + ", S) * (FLT)(x3_in && " + y_in + ");\n"; } else { - c += " s0 = " + src_tensor.ReadWHS(xc[0], yc[y], "Z", mode) + ";\n"; - c += " s1 = " + src_tensor.ReadWHS(xc[1], yc[y], "Z", mode) + ";\n"; - c += " s2 = " + src_tensor.ReadWHS(xc[2], yc[y], "Z", mode) + ";\n"; - c += " s3 = " + src_tensor.ReadWHS(xc[3], yc[y], "Z", mode) + ";\n"; + c += " s0 = args.src_tensor.Read(" + xc[0] + ", " + yc[y] + ", S);\n"; + c += " s1 = args.src_tensor.Read(" + xc[1] + ", " + yc[y] + ", S);\n"; + c += " s2 = args.src_tensor.Read(" + xc[2] + ", " + yc[y] + ", S);\n"; + c += " s3 = args.src_tensor.Read(" + xc[3] + ", " + yc[y] + ", S);\n"; } }; c += " {\n"; @@ -224,40 +216,38 @@ std::string GenerateDepthwiseConvCode( c += " r3 += TO_ACCUM_TYPE(" + W[8] + " * s3);\n"; c += " }\n"; if (!weights_are_buffer) { - 
c += " FLT4 bias = READ_IMAGE(filters, smp_none, (int2)(9, Z));\n"; + c += " FLT4 bias = args.weights.Read(9, S);\n"; } c += " r0 += TO_ACCUM_TYPE(" + bias + ");\n"; c += " r1 += TO_ACCUM_TYPE(" + bias + ");\n"; c += " r2 += TO_ACCUM_TYPE(" + bias + ");\n"; c += " r3 += TO_ACCUM_TYPE(" + bias + ");\n"; if (local_mem_uploads) { - c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) " - "return;\n"; + c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() " + "|| " + "S >= args.dst_tensor.Slices()) { \n"; + c += " return; \n"; + c += " } \n"; } - c += " if(X + 0 < dst_size.x && Y + 0 < dst_size.y) {\n"; + c += " if(X + 0 < args.dst_tensor.Width() && Y + 0 < " + "args.dst_tensor.Height()) {\n"; c += " FLT4 result = TO_FLT4(r0);\n"; - c += " " + dst_tensor.GetAddressWHS("address", "X + 0", "Y + 0", "Z") + "\n"; - LinkingContext context{"result", "X + 0", "Y + 0", "Z"}; - c += PostProcess(linked_operations, context); - c += " " + dst_tensor.WriteWHS("result", "X + 0", "Y + 0", "Z") + "\n"; + c += " args.dst_tensor.Write(result, X + 0, Y + 0, S)\n"; c += " }\n"; - c += " if(X + 1 < dst_size.x && Y + 0 < dst_size.y) {\n"; + c += " if(X + 1 < args.dst_tensor.Width() && Y + 0 < " + "args.dst_tensor.Height()) {\n"; c += " FLT4 result = TO_FLT4(r1);\n"; - context = {"result", "X + 1", "Y + 0", "Z"}; - c += PostProcess(linked_operations, context); - c += " " + dst_tensor.WriteWHS("result", "X + 1", "Y + 0", "Z") + "\n"; + c += " args.dst_tensor.Write(result, X + 1, Y + 0, S)\n"; c += " }\n"; - c += " if(X + 0 < dst_size.x && Y + 1 < dst_size.y) {\n"; + c += " if(X + 0 < args.dst_tensor.Width() && Y + 1 < " + "args.dst_tensor.Height()) {\n"; c += " FLT4 result = TO_FLT4(r2);\n"; - context = {"result", "X + 0", "Y + 1", "Z"}; - c += PostProcess(linked_operations, context); - c += " " + dst_tensor.WriteWHS("result", "X + 0", "Y + 1", "Z") + "\n"; + c += " args.dst_tensor.Write(result, X + 0, Y + 1, S)\n"; c += " }\n"; - c += " if(X + 1 < dst_size.x && Y + 1 < dst_size.y) {\n"; + c += " if(X + 1 < args.dst_tensor.Width() && Y + 1 < " + "args.dst_tensor.Height()) {\n"; c += " FLT4 result = TO_FLT4(r3);\n"; - context = {"result", "X + 1", "Y + 1", "Z"}; - c += PostProcess(linked_operations, context); - c += " " + dst_tensor.WriteWHS("result", "X + 1", "Y + 1", "Z") + "\n"; + c += " args.dst_tensor.Write(result, X + 1, Y + 1, S)\n"; c += " }\n"; c += "}\n"; @@ -277,9 +267,6 @@ DepthwiseConv3x3::DepthwiseConv3x3(DepthwiseConv3x3&& operation) : GPUOperation(std::move(operation)), weights_are_buffer_(operation.weights_are_buffer_), local_mem_uploads_(operation.local_mem_uploads_), - weights_tex2d_(std::move(operation.weights_tex2d_)), - weights_buf_(std::move(operation.weights_buf_)), - weights_(operation.weights_), kernel_(std::move(operation.kernel_)), work_group_size_(operation.work_group_size_) {} @@ -287,9 +274,6 @@ DepthwiseConv3x3& DepthwiseConv3x3::operator=(DepthwiseConv3x3&& operation) { if (this != &operation) { std::swap(weights_are_buffer_, operation.weights_are_buffer_); std::swap(local_mem_uploads_, operation.local_mem_uploads_); - weights_tex2d_ = std::move(operation.weights_tex2d_); - weights_buf_ = std::move(operation.weights_buf_); - std::swap(weights_, operation.weights_); kernel_ = std::move(operation.kernel_); std::swap(work_group_size_, operation.work_group_size_); GPUOperation::operator=(std::move(operation)); @@ -300,8 +284,15 @@ DepthwiseConv3x3& DepthwiseConv3x3::operator=(DepthwiseConv3x3&& operation) { absl::Status DepthwiseConv3x3::Compile( const 
CreationContext& creation_context) { std::string code = GenerateDepthwiseConvCode( - definition_, linked_operations_, *creation_context.device, - weights_are_buffer_, local_mem_uploads_); + definition_, *creation_context.device, weights_are_buffer_, + local_mem_uploads_, &args_); + std::string element_wise_code; + RETURN_IF_ERROR( + MergeOperations(linked_operations_, &args_, &element_wise_code)); + RETURN_IF_ERROR(args_.TransformToCLCode(creation_context.device->GetInfo(), + {{"dst_tensor", element_wise_code}}, + &code)); + std::vector options; if (definition_.precision == CalculationsPrecision::F16 && creation_context.device->IsPowerVR()) { @@ -313,13 +304,10 @@ absl::Status DepthwiseConv3x3::Compile( } absl::Status DepthwiseConv3x3::BindArguments() { - kernel_.ResetBindingCounter(); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr())); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_)); - RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_)); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHSB())); - return absl::OkStatus(); + RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0])); + RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", dst_[0])); + RETURN_IF_ERROR(SetArguments(linked_operations_, &args_)); + return args_.Bind(kernel_.kernel()); } int3 DepthwiseConv3x3::GetGridSize() const { diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv_3x3.h b/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv_3x3.h index 1ab17e3048c..9cb2ac41c87 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv_3x3.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv_3x3.h @@ -71,9 +71,6 @@ class DepthwiseConv3x3 : public GPUOperation { bool weights_are_buffer_; bool local_mem_uploads_; - Texture2D weights_tex2d_; - Buffer weights_buf_; - cl_mem weights_; CLKernel kernel_; int3 work_group_size_ = int3(8, 4, 1); @@ -90,17 +87,19 @@ absl::Status DepthwiseConv3x3::UploadWeightsAndBiases( const bool fp32_weights = definition_.precision == CalculationsPrecision::F32; const int float4_size = fp32_weights ? 16 : 8; + Texture2D weights_tex2d; + Buffer weights_buf; if (fp32_weights) { std::vector gpu_data(elements_count); RearrangeWeightsAndBiasesData(weights, biases, absl::MakeSpan(gpu_data)); if (weights_are_buffer_) { RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count, gpu_data.data(), context, - &weights_buf_)); + &weights_buf)); } else { RETURN_IF_ERROR(CreateTexture2DRGBA( definition_.GetDataType(), texture_width, texture_height, - gpu_data.data(), context, &weights_tex2d_)); + gpu_data.data(), context, &weights_tex2d)); } } else { std::vector gpu_data(elements_count); @@ -108,18 +107,27 @@ absl::Status DepthwiseConv3x3::UploadWeightsAndBiases( if (weights_are_buffer_) { RETURN_IF_ERROR(CreateReadOnlyBuffer(float4_size * elements_count, gpu_data.data(), context, - &weights_buf_)); + &weights_buf)); } else { RETURN_IF_ERROR(CreateTexture2DRGBA( definition_.GetDataType(), texture_width, texture_height, - gpu_data.data(), context, &weights_tex2d_)); + gpu_data.data(), context, &weights_tex2d)); } } if (weights_are_buffer_) { - weights_ = weights_buf_.GetMemoryPtr(); + BufferDescriptor desc; + desc.element_type = fp32_weights ? 
DataType::FLOAT32 : DataType::FLOAT16; + desc.element_size = 4; + args_.AddObject("weights", AccessType::READ, + absl::make_unique(std::move(weights_buf)), + absl::make_unique(desc)); } else { - weights_ = weights_tex2d_.GetMemoryPtr(); + Texture2DDescriptor desc; + desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16; + args_.AddObject("weights", AccessType::READ, + absl::make_unique(std::move(weights_tex2d)), + absl::make_unique(desc)); } return absl::OkStatus(); diff --git a/tensorflow/lite/delegates/gpu/cl/tensor_type.cc b/tensorflow/lite/delegates/gpu/cl/tensor_type.cc index 0c3a1e3508c..ef49f67cf77 100644 --- a/tensorflow/lite/delegates/gpu/cl/tensor_type.cc +++ b/tensorflow/lite/delegates/gpu/cl/tensor_type.cc @@ -172,6 +172,10 @@ absl::Status TensorDescriptor::PerformSelector( return PerformWriteLinearSelector(args, result); } else if (selector == "GetAddress") { return PerformGetAddressSelector(args, result); + } else if (selector == "GetPtrWithSliceOffset") { + return PerformGetPtrWithSliceOffsetSelector(args, result); + } else if (selector == "GetWHOffset") { + return PerformGetWHOffsetSelector(args, result); } else { return absl::NotFoundError(absl::StrCat( "TensorDescriptor don't have selector with name - ", selector)); @@ -351,6 +355,43 @@ absl::Status TensorDescriptor::PerformGetAddressSelector( return absl::OkStatus(); } +absl::Status TensorDescriptor::PerformGetPtrWithSliceOffsetSelector( + const std::vector& args, std::string* result) const { + if (storage_type != TensorStorageType::BUFFER) { + return absl::InvalidArgumentError( + "GetPtrWithSliceOffset selector can be used only with BUFFER"); + } + if (args.size() != 1) { + return absl::NotFoundError(absl::StrCat( + "GetPtrWithSliceOffset require one argument(slice coordinate), but ", + args.size(), " was passed")); + } + const std::string width = IsBatchedWidth() ? "width_batched" : "width"; + if (HasAxis(Axis::DEPTH)) { + *result = + absl::StrCat("buffer + ", args[0], " * ", width, " * height * depth"); + } else { + *result = absl::StrCat("buffer + ", args[0], " * ", width, " * height"); + } + return absl::OkStatus(); +} + +absl::Status TensorDescriptor::PerformGetWHOffsetSelector( + const std::vector& args, std::string* result) const { + if (storage_type != TensorStorageType::BUFFER) { + return absl::InvalidArgumentError( + "GetWHOffset selector can be used only with BUFFER"); + } + if (args.size() != 2) { + return absl::NotFoundError(absl::StrCat( + "GetWHOffset require two arguments(X and Y coordinates), but ", + args.size(), " was passed")); + } + const std::string width = IsBatchedWidth() ? 
"width_batched" : "width"; + *result = absl::StrCat(args[1], " * ", width, " + ", args[0]); + return absl::OkStatus(); +} + std::string TensorDescriptor::DeclareAddress(const std::string& var_name, const std::string& address) const { return absl::StrCat(StorageTypeToAddressType(), " ", var_name, " = ", address, diff --git a/tensorflow/lite/delegates/gpu/cl/tensor_type.h b/tensorflow/lite/delegates/gpu/cl/tensor_type.h index 3a1d7abb01a..12c078f1025 100644 --- a/tensorflow/lite/delegates/gpu/cl/tensor_type.h +++ b/tensorflow/lite/delegates/gpu/cl/tensor_type.h @@ -85,6 +85,12 @@ struct TensorDescriptor : public GPUObjectDescriptor { absl::Status PerformGetAddressSelector(const std::vector& args, std::string* result) const; + absl::Status PerformGetPtrWithSliceOffsetSelector( + const std::vector& args, std::string* result) const; + + absl::Status PerformGetWHOffsetSelector(const std::vector& args, + std::string* result) const; + std::string DeclareAddress(const std::string& var_name, const std::string& address) const; diff --git a/tensorflow/lite/delegates/xnnpack/BUILD b/tensorflow/lite/delegates/xnnpack/BUILD index eaf7d8f6f03..e0d3d39f719 100644 --- a/tensorflow/lite/delegates/xnnpack/BUILD +++ b/tensorflow/lite/delegates/xnnpack/BUILD @@ -180,6 +180,23 @@ cc_library( ], ) +cc_library( + name = "prelu_tester", + testonly = 1, + srcs = ["prelu_tester.cc"], + hdrs = ["prelu_tester.h"], + deps = [ + "//tensorflow/lite:framework", + "//tensorflow/lite:schema_fbs_version", + "//tensorflow/lite/c:common", + "//tensorflow/lite/kernels:builtin_ops", + "//tensorflow/lite/schema:schema_fbs", + "@FP16", + "@com_google_googletest//:gtest", + "@flatbuffers", + ], +) + cc_library( name = "reduce_tester", testonly = 1, @@ -527,6 +544,21 @@ cc_test( ], ) +cc_test( + name = "prelu_test", + srcs = ["prelu_test.cc"], + linkopts = select({ + "//tensorflow:emscripten": EMSCRIPTEN_LINKOPTS, + "//conditions:default": [], + }), + deps = [ + ":prelu_tester", + ":test_main", + ":xnnpack_delegate_test_mode", + "@com_google_googletest//:gtest", + ], +) + cc_test( name = "relu_test", srcs = ["relu_test.cc"], diff --git a/tensorflow/lite/delegates/xnnpack/leaky_relu_tester.h b/tensorflow/lite/delegates/xnnpack/leaky_relu_tester.h index f1d9efd7209..191dc938e89 100644 --- a/tensorflow/lite/delegates/xnnpack/leaky_relu_tester.h +++ b/tensorflow/lite/delegates/xnnpack/leaky_relu_tester.h @@ -21,7 +21,6 @@ limitations under the License. #include #include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace xnnpack { diff --git a/tensorflow/lite/delegates/xnnpack/prelu_test.cc b/tensorflow/lite/delegates/xnnpack/prelu_test.cc new file mode 100644 index 00000000000..10026915add --- /dev/null +++ b/tensorflow/lite/delegates/xnnpack/prelu_test.cc @@ -0,0 +1,583 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include +#include +#include + +#include +#include "tensorflow/lite/delegates/xnnpack/prelu_tester.h" +#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h" + +namespace tflite { +namespace xnnpack { + +// TODO(b/159727692) +TEST(Prelu, DISABLED_4DBy4D) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + const auto batch = shape_rng(); + const auto height = shape_rng(); + const auto width = shape_rng(); + const auto channels = shape_rng(); + + PreluTester() + .InputShape({batch, height, width, channels}) + .SlopeShape({batch, height, width, channels}) + .Test(xnnpack_delegate.get()); +} + +TEST(Prelu, 4DBy4DBroadcastChannels) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + const auto batch = shape_rng(); + const auto height = shape_rng(); + const auto width = shape_rng(); + const auto channels = shape_rng(); + + PreluTester() + .InputShape({batch, height, width, channels}) + .SlopeShape({1, 1, 1, channels}) + .Test(xnnpack_delegate.get()); +} + +// TODO(b/159727692) +TEST(Prelu, DISABLED_4DBy4DBroadcastWidth) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + const auto batch = shape_rng(); + const auto height = shape_rng(); + const auto width = shape_rng(); + const auto channels = shape_rng(); + + PreluTester() + .InputShape({batch, height, width, channels}) + .SlopeShape({1, 1, width, 1}) + .Test(xnnpack_delegate.get()); +} + +// TODO(b/159727692) +TEST(Prelu, DISABLED_4DBy4DBroadcastHeight) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + const auto batch = shape_rng(); + const auto height = shape_rng(); + const auto width = shape_rng(); + const auto channels = shape_rng(); + + PreluTester() + .InputShape({batch, height, width, channels}) + .SlopeShape({1, height, 1, 1}) + .Test(xnnpack_delegate.get()); +} + +// TODO(b/159727692) +TEST(Prelu, DISABLED_4DBy4DBroadcastBatch) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + const auto batch = shape_rng(); + const auto height = shape_rng(); + const auto width = shape_rng(); + const auto channels = shape_rng(); + + PreluTester() + .InputShape({batch, height, width, channels}) + .SlopeShape({batch, 1, 1, 1}) + .Test(xnnpack_delegate.get()); +} + +// TODO(b/159727692) +TEST(Prelu, DISABLED_4DBy4DBroadcastHeightWidthChannels) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + 
TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + const auto batch = shape_rng(); + const auto height = shape_rng(); + const auto width = shape_rng(); + const auto channels = shape_rng(); + + PreluTester() + .InputShape({batch, height, width, channels}) + .SlopeShape({1, height, width, channels}) + .Test(xnnpack_delegate.get()); +} + +// TODO(b/159727692) +TEST(Prelu, DISABLED_4DBy3D) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + const auto batch = shape_rng(); + const auto height = shape_rng(); + const auto width = shape_rng(); + const auto channels = shape_rng(); + + PreluTester() + .InputShape({batch, height, width, channels}) + .SlopeShape({height, width, channels}) + .Test(xnnpack_delegate.get()); +} + +// TODO(b/159727692) +TEST(Prelu, DISABLED_4DBy2D) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + const auto batch = shape_rng(); + const auto height = shape_rng(); + const auto width = shape_rng(); + const auto channels = shape_rng(); + + PreluTester() + .InputShape({batch, height, width, channels}) + .SlopeShape({width, channels}) + .Test(xnnpack_delegate.get()); +} + +TEST(Prelu, 4DBy1D) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + const auto batch = shape_rng(); + const auto height = shape_rng(); + const auto width = shape_rng(); + const auto channels = shape_rng(); + + PreluTester() + .InputShape({batch, height, width, channels}) + .SlopeShape({channels}) + .Test(xnnpack_delegate.get()); +} + +// TODO(b/159727692) +TEST(Prelu, DISABLED_4DBy0D) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + const auto batch = shape_rng(); + const auto height = shape_rng(); + const auto width = shape_rng(); + const auto channels = shape_rng(); + + PreluTester() + .InputShape({batch, height, width, channels}) + .SlopeShape({}) + .Test(xnnpack_delegate.get()); +} + +// TODO(b/159727692) +TEST(Prelu, DISABLED_3DBy3D) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + const auto batch = shape_rng(); + const auto width = shape_rng(); + const auto channels = shape_rng(); + + PreluTester() + .InputShape({batch, width, channels}) + .SlopeShape({batch, width, channels}) + .Test(xnnpack_delegate.get()); +} + +TEST(Prelu, 3DBy3DBroadcastChannels) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + 
TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + const auto batch = shape_rng(); + const auto width = shape_rng(); + const auto channels = shape_rng(); + + PreluTester() + .InputShape({batch, width, channels}) + .SlopeShape({1, 1, channels}) + .Test(xnnpack_delegate.get()); +} + +// TODO(b/159727692) +TEST(Prelu, DISABLED_3DBy3DBroadcastWidth) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + const auto batch = shape_rng(); + const auto width = shape_rng(); + const auto channels = shape_rng(); + + PreluTester() + .InputShape({batch, width, channels}) + .SlopeShape({1, width, 1}) + .Test(xnnpack_delegate.get()); +} + +// TODO(b/159727692) +TEST(Prelu, DISABLED_3DBy3DBroadcastBatch) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + const auto batch = shape_rng(); + const auto width = shape_rng(); + const auto channels = shape_rng(); + + PreluTester() + .InputShape({batch, width, channels}) + .SlopeShape({batch, 1, 1}) + .Test(xnnpack_delegate.get()); +} + +// TODO(b/159727692) +TEST(Prelu, DISABLED_3DBy3DBroadcastWidthChannels) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + const auto batch = shape_rng(); + const auto width = shape_rng(); + const auto channels = shape_rng(); + + PreluTester() + .InputShape({batch, width, channels}) + .SlopeShape({1, width, channels}) + .Test(xnnpack_delegate.get()); +} + +// TODO(b/159727692) +TEST(Prelu, DISABLED_3DBy2D) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + const auto batch = shape_rng(); + const auto width = shape_rng(); + const auto channels = shape_rng(); + + PreluTester() + .InputShape({batch, width, channels}) + .SlopeShape({width, channels}) + .Test(xnnpack_delegate.get()); +} + +TEST(Prelu, 3DBy1D) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + const auto batch = shape_rng(); + const auto width = shape_rng(); + const auto channels = shape_rng(); + + PreluTester() + .InputShape({batch, width, channels}) + .SlopeShape({channels}) + .Test(xnnpack_delegate.get()); +} + +// TODO(b/159727692) +TEST(Prelu, DISABLED_3DBy0D) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), 
std::ref(rng)); + const auto batch = shape_rng(); + const auto width = shape_rng(); + const auto channels = shape_rng(); + + PreluTester() + .InputShape({batch, width, channels}) + .SlopeShape({}) + .Test(xnnpack_delegate.get()); +} + +// TODO(b/159727692) +TEST(Prelu, DISABLED_2DBy2D) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + const auto batch = shape_rng(); + const auto channels = shape_rng(); + + PreluTester() + .InputShape({batch, channels}) + .SlopeShape({batch, channels}) + .Test(xnnpack_delegate.get()); +} + +TEST(Prelu, 2DBy2DBroadcastChannels) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + const auto batch = shape_rng(); + const auto channels = shape_rng(); + + PreluTester() + .InputShape({batch, channels}) + .SlopeShape({1, channels}) + .Test(xnnpack_delegate.get()); +} + +// TODO(b/159727692) +TEST(Prelu, DISABLED_2DBy2DBroadcastBatch) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + const auto batch = shape_rng(); + const auto channels = shape_rng(); + + PreluTester() + .InputShape({batch, channels}) + .SlopeShape({batch, 1}) + .Test(xnnpack_delegate.get()); +} + +TEST(Prelu, 2DBy1D) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + const auto batch = shape_rng(); + const auto channels = shape_rng(); + + PreluTester() + .InputShape({batch, channels}) + .SlopeShape({channels}) + .Test(xnnpack_delegate.get()); +} + +// TODO(b/159727692) +TEST(Prelu, DISABLED_2DBy0D) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + const auto batch = shape_rng(); + const auto channels = shape_rng(); + + PreluTester() + .InputShape({batch, channels}) + .SlopeShape({}) + .Test(xnnpack_delegate.get()); +} + +TEST(Prelu, 1DBy1D) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + const auto batch = shape_rng(); + + PreluTester().InputShape({batch}).SlopeShape({batch}).Test( + xnnpack_delegate.get()); +} + +// TODO(b/159727692) +TEST(Prelu, DISABLED_1DBy0D) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + const auto batch = 
shape_rng(); + + PreluTester().InputShape({batch}).SlopeShape({}).Test(xnnpack_delegate.get()); +} + +TEST(Prelu, FP16Weights) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + const auto batch = shape_rng(); + const auto height = shape_rng(); + const auto width = shape_rng(); + const auto channels = shape_rng(); + + PreluTester() + .InputShape({batch, height, width, channels}) + .SlopeShape({channels}) + .FP16Weights() + .Test(xnnpack_delegate.get()); +} + +TEST(Prelu, SparseWeights) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + const auto batch = shape_rng(); + const auto height = shape_rng(); + const auto width = shape_rng(); + const auto channels = shape_rng(); + + PreluTester() + .InputShape({batch, height, width, channels}) + .SlopeShape({channels}) + .SparseWeights() + .Test(xnnpack_delegate.get()); +} + +TEST(Prelu, MultiThreading) { + TfLiteXNNPackDelegateOptions delegate_options = + TfLiteXNNPackDelegateOptionsDefault(); + delegate_options.num_threads = 2; + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + const auto batch = shape_rng(); + const auto height = shape_rng(); + const auto width = shape_rng(); + const auto channels = shape_rng(); + + PreluTester() + .InputShape({batch, height, width, channels}) + .SlopeShape({channels}) + .Test(xnnpack_delegate.get()); +} + +} // namespace xnnpack +} // namespace tflite diff --git a/tensorflow/lite/delegates/xnnpack/prelu_tester.cc b/tensorflow/lite/delegates/xnnpack/prelu_tester.cc new file mode 100644 index 00000000000..ab20c2c51dc --- /dev/null +++ b/tensorflow/lite/delegates/xnnpack/prelu_tester.cc @@ -0,0 +1,237 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/lite/delegates/xnnpack/prelu_tester.h" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include "flatbuffers/flatbuffers.h" // from @flatbuffers +#include "tensorflow/lite/interpreter.h" +#include "tensorflow/lite/kernels/register.h" +#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/version.h" + +namespace tflite { +namespace xnnpack { + +void PreluTester::Test(TfLiteDelegate* delegate) const { + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto input_rng = std::bind(std::uniform_real_distribution(-1.0f, 1.0f), + std::ref(rng)); + + std::vector buffer = CreateTfLiteModel(); + const Model* model = GetModel(buffer.data()); + + std::unique_ptr delegate_interpreter; + ASSERT_EQ( + InterpreterBuilder(model, ::tflite::ops::builtin::BuiltinOpResolver())( + &delegate_interpreter), + kTfLiteOk); + std::unique_ptr default_interpreter; + ASSERT_EQ( + InterpreterBuilder(model, ::tflite::ops::builtin::BuiltinOpResolver())( + &default_interpreter), + kTfLiteOk); + + ASSERT_TRUE(delegate_interpreter); + ASSERT_TRUE(default_interpreter); + + ASSERT_EQ(delegate_interpreter->inputs().size(), 1); + ASSERT_EQ(default_interpreter->inputs().size(), 1); + + ASSERT_EQ(delegate_interpreter->outputs().size(), 1); + ASSERT_EQ(default_interpreter->outputs().size(), 1); + + ASSERT_EQ(delegate_interpreter->AllocateTensors(), kTfLiteOk); + ASSERT_EQ(default_interpreter->AllocateTensors(), kTfLiteOk); + + ASSERT_EQ(delegate_interpreter->ModifyGraphWithDelegate(delegate), kTfLiteOk); + + float* default_input_data = default_interpreter->typed_tensor( + default_interpreter->inputs()[0]); + std::generate(default_input_data, + default_input_data + ComputeSize(InputShape()), + std::ref(input_rng)); + + float* xnnpack_input_data = delegate_interpreter->typed_tensor( + delegate_interpreter->inputs()[0]); + std::copy(default_input_data, default_input_data + ComputeSize(InputShape()), + xnnpack_input_data); + + ASSERT_EQ(default_interpreter->Invoke(), kTfLiteOk); + ASSERT_EQ(delegate_interpreter->Invoke(), kTfLiteOk); + + float* default_output_data = default_interpreter->typed_tensor( + default_interpreter->outputs()[0]); + float* xnnpack_output_data = delegate_interpreter->typed_tensor( + delegate_interpreter->outputs()[0]); + + for (size_t i = 0; i < ComputeSize(OutputShape()); i++) { + ASSERT_EQ(default_output_data[i], xnnpack_output_data[i]); + } +} + +std::vector PreluTester::CreateTfLiteModel() const { + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto slope_rng = std::bind(std::uniform_real_distribution(0.25f, 0.5f), + std::ref(rng)); + + flatbuffers::FlatBufferBuilder builder; + std::vector> operator_codes{ + {CreateOperatorCode(builder, BuiltinOperator_PRELU)}}; + if (FP16Weights()) { + operator_codes.emplace_back( + CreateOperatorCode(builder, BuiltinOperator_DEQUANTIZE)); + } else if (SparseWeights()) { + operator_codes.emplace_back( + CreateOperatorCode(builder, BuiltinOperator_DENSIFY)); + } + + std::vector> buffers{{ + CreateBuffer(builder, builder.CreateVector({})), + }}; + + if (FP16Weights()) { + std::vector slope_data(ComputeSize(SlopeShape())); + std::generate(slope_data.begin(), slope_data.end(), + std::bind(fp16_ieee_from_fp32_value, slope_rng)); + + buffers.push_back(CreateBuffer( + builder, builder.CreateVector( + 
reinterpret_cast(slope_data.data()), + sizeof(uint16_t) * slope_data.size()))); + } else { + std::vector slope_data(ComputeSize(SlopeShape())); + std::generate(slope_data.begin(), slope_data.end(), slope_rng); + + buffers.push_back(CreateBuffer( + builder, builder.CreateVector( + reinterpret_cast(slope_data.data()), + sizeof(float) * slope_data.size()))); + } + + std::vector> tensors; + std::vector> operators; + if (FP16Weights()) { + tensors.emplace_back(CreateTensor( + builder, + builder.CreateVector(SlopeShape().data(), SlopeShape().size()), + TensorType_FLOAT16, /*buffer=*/1)); + } else if (SparseWeights()) { + const int dims_count = SlopeShape().size(); + std::vector> dim_metadata( + dims_count); + std::vector traversal_order(dims_count); + for (int i = 0; i < dims_count; i++) { + traversal_order[i] = i; + dim_metadata[i] = CreateDimensionMetadata(builder, DimensionType_DENSE, + SlopeShape()[i]); + } + const flatbuffers::Offset sparsity_param = + CreateSparsityParameters(builder, builder.CreateVector(traversal_order), + 0, builder.CreateVector(dim_metadata)); + tensors.emplace_back(CreateTensor( + builder, + builder.CreateVector(SlopeShape().data(), SlopeShape().size()), + TensorType_FLOAT32, /*buffer=*/1, /*name=*/0, /*quantization=*/0, + /*is_variable=*/false, /*sparsity=*/sparsity_param)); + } + if (FP16Weights()) { + const std::array dequantize_inputs{{0}}; + const std::array dequantize_outputs{{2}}; + operators.emplace_back(CreateOperator( + builder, /*opcode_index=*/1, + builder.CreateVector(dequantize_inputs.data(), + dequantize_inputs.size()), + builder.CreateVector(dequantize_outputs.data(), + dequantize_outputs.size()))); + } else if (SparseWeights()) { + const std::array densify_inputs{{0}}; + const std::array densify_outputs{{2}}; + operators.emplace_back( + CreateOperator(builder, /*opcode_index=*/1, + builder.CreateVector(densify_inputs.data(), + densify_inputs.size()), + builder.CreateVector(densify_outputs.data(), + densify_outputs.size()))); + } + tensors.emplace_back(CreateTensor( + builder, + builder.CreateVector(InputShape().data(), InputShape().size()), + TensorType_FLOAT32)); + tensors.emplace_back(CreateTensor( + builder, + builder.CreateVector(SlopeShape().data(), SlopeShape().size()), + TensorType_FLOAT32, + /*buffer=*/(FP16Weights() || SparseWeights()) ? 
0 : 1)); + tensors.emplace_back(CreateTensor( + builder, + builder.CreateVector(OutputShape().data(), OutputShape().size()), + TensorType_FLOAT32)); + + const std::array op_inputs{ + {static_cast(tensors.size()) - 3, + static_cast(tensors.size()) - 2}}; + const std::array op_outputs{ + {static_cast(tensors.size()) - 1}}; + operators.emplace_back(CreateOperator( + builder, /*opcode_index=*/0, + builder.CreateVector(op_inputs.data(), op_inputs.size()), + builder.CreateVector(op_outputs.data(), op_outputs.size()))); + + const std::array subgraph_inputs{ + {static_cast(tensors.size() - 3)}}; + const std::array subgraph_outputs{ + {static_cast(tensors.size()) - 1}}; + flatbuffers::Offset subgraph = CreateSubGraph( + builder, builder.CreateVector(tensors.data(), tensors.size()), + builder.CreateVector(subgraph_inputs.data(), + subgraph_inputs.size()), + builder.CreateVector(subgraph_outputs.data(), + subgraph_outputs.size()), + builder.CreateVector(operators.data(), operators.size())); + + flatbuffers::Offset description = + builder.CreateString("PReLU model"); + + flatbuffers::Offset model_buffer = CreateModel( + builder, TFLITE_SCHEMA_VERSION, + builder.CreateVector(operator_codes.data(), operator_codes.size()), + builder.CreateVector(&subgraph, 1), description, + builder.CreateVector(buffers.data(), buffers.size())); + + builder.Finish(model_buffer); + + return std::vector(builder.GetBufferPointer(), + builder.GetBufferPointer() + builder.GetSize()); +} + +int32_t PreluTester::ComputeSize(const std::vector& shape) { + return std::accumulate(shape.cbegin(), shape.cend(), 1, + std::multiplies()); +} + +} // namespace xnnpack +} // namespace tflite diff --git a/tensorflow/lite/delegates/xnnpack/prelu_tester.h b/tensorflow/lite/delegates/xnnpack/prelu_tester.h new file mode 100644 index 00000000000..e89bae6029b --- /dev/null +++ b/tensorflow/lite/delegates/xnnpack/prelu_tester.h @@ -0,0 +1,88 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_LITE_DELEGATES_XNNPACK_PRELU_TESTER_H_ +#define TENSORFLOW_LITE_DELEGATES_XNNPACK_PRELU_TESTER_H_ + +#include +#include + +#include +#include "tensorflow/lite/c/common.h" + +namespace tflite { +namespace xnnpack { + +class PreluTester { + public: + PreluTester() = default; + PreluTester(const PreluTester&) = delete; + PreluTester& operator=(const PreluTester&) = delete; + + inline PreluTester& InputShape(std::initializer_list shape) { + for (auto it = shape.begin(); it != shape.end(); ++it) { + EXPECT_GT(*it, 0); + } + input_shape_ = std::vector(shape.begin(), shape.end()); + return *this; + } + + inline const std::vector& InputShape() const { return input_shape_; } + + inline PreluTester& SlopeShape(std::initializer_list shape) { + for (auto it = shape.begin(); it != shape.end(); ++it) { + EXPECT_GT(*it, 0); + } + slope_shape_ = std::vector(shape.begin(), shape.end()); + return *this; + } + + inline const std::vector& SlopeShape() const { return slope_shape_; } + + inline const std::vector& OutputShape() const { + return InputShape(); + } + + inline PreluTester& FP16Weights() { + fp16_weights_ = true; + return *this; + } + + inline bool FP16Weights() const { return fp16_weights_; } + + inline PreluTester& SparseWeights() { + sparse_weights_ = true; + return *this; + } + + inline bool SparseWeights() const { return sparse_weights_; } + + void Test(TfLiteDelegate* delegate) const; + + private: + std::vector CreateTfLiteModel() const; + + static int32_t ComputeSize(const std::vector& shape); + + std::vector input_shape_; + std::vector slope_shape_; + bool fp16_weights_ = false; + bool sparse_weights_ = false; +}; + +} // namespace xnnpack +} // namespace tflite + +#endif // TENSORFLOW_LITE_DELEGATES_XNNPACK_PRELU_TESTER_H_ diff --git a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc index 0afc9c32122..31468ef7407 100644 --- a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc +++ b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc @@ -2266,7 +2266,8 @@ class Subgraph { const TfLiteTensor& input_tensor = tensors[node->inputs->data[0]]; TF_LITE_ENSURE_STATUS(CheckTensorFloatType( logging_context, input_tensor, node->inputs->data[0], node_index)); - TF_LITE_ENSURE_STATUS(CheckTensorShape(logging_context, input_tensor, 4, + TF_LITE_ENSURE_STATUS(CheckTensorShape(logging_context, input_tensor, 1, + XNN_MAX_TENSOR_DIMS, node->inputs->data[0])); TF_LITE_ENSURE_STATUS(CheckTensorNonDynamicAllocation( logging_context, input_tensor, node->inputs->data[0], node_index)); @@ -2284,7 +2285,8 @@ class Subgraph { const TfLiteTensor& output_tensor = tensors[node->outputs->data[0]]; TF_LITE_ENSURE_STATUS(CheckTensorFloatType( logging_context, output_tensor, node->outputs->data[0], node_index)); - TF_LITE_ENSURE_STATUS(CheckTensorShape(logging_context, output_tensor, 4, + TF_LITE_ENSURE_STATUS(CheckTensorShape(logging_context, output_tensor, 1, + XNN_MAX_TENSOR_DIMS, node->outputs->data[0])); TF_LITE_ENSURE_STATUS(CheckTensorNonDynamicAllocation( logging_context, output_tensor, node->outputs->data[0], node_index)); diff --git a/tensorflow/lite/g3doc/_book.yaml b/tensorflow/lite/g3doc/_book.yaml index 6c454fab921..715e0c8431b 100644 --- a/tensorflow/lite/g3doc/_book.yaml +++ b/tensorflow/lite/g3doc/_book.yaml @@ -86,6 +86,8 @@ upper_tabs: path: /lite/convert/rnn - title: "Add metadata" path: /lite/convert/metadata + - title: "Composite 
operation fusion" + path: /lite/convert/operation_fusion - title: "1.x compatibility" path: /lite/convert/1x_compatibility diff --git a/tensorflow/lite/g3doc/convert/operation_fusion.md b/tensorflow/lite/g3doc/convert/operation_fusion.md new file mode 100644 index 00000000000..74af2a08b81 --- /dev/null +++ b/tensorflow/lite/g3doc/convert/operation_fusion.md @@ -0,0 +1,270 @@ +# TensorFlow operation fusion + +## Overview + +This page describes the design and steps needed to convert composite operations +in TensorFlow to fused operations in TensorFlow Lite. This infrastructure is +general purpose and supports conversion of any composite operation in TensorFlow +to a corresponding fused operation in TensorFlow Lite. + +An example use of this infrastructure is TensorFlow RNN operation fusion to +TensorFlow Lite, as detailed +[here](https://www.tensorflow.org/lite/convert/rnn). + +### What are fused operations + +![drawing](../images/convert/op_fusion_banner.jpg) + +TensorFlow operations can either be primitive ops e.g. +[tf.add](https://www.tensorflow.org/api_docs/python/tf/math/add) or they can be +composed from other primitive operations e.g. +[tf.einsum](https://www.tensorflow.org/api_docs/python/tf/einsum). A primitive +operation shows up as a single node in the TensorFlow graph while.a composite +operation is a collection of nodes in the TensorFlow graph. Executing a +composite operation is equivalent to executing each of its constituent primitive +operations. + +A fused operation corresponds to a single operation that subsumes all the +computation performed by each primitive operation within the corresponding +composite operation. + +### Benefits of fused operations + +Fused operations exist to maximize the performance of their underlying kernel +implementations, by optimizing the overall computation and reducing memory +footprint. This is very valuable, especially for low-latency inference workloads +and resource constrained mobile platforms. + +Fused operations also provide a higher level interface to define complex +transformations like quantization, which would otherwise be infeasible or very +hard to do at a more granular level. + +TensorFlow Lite has many instances of fused operations for the reasons +articulated above. These fused operations typically correspond to composite +operations in the source TensorFlow program. Examples of composite operations in +TensorFlow that are implemented as a single fused operation in TensorFlow Lite +include various RNN operations like Unidirectional and Bidirectional sequence +LSTM, convolution (conv2d, bias add, relu), fully connected (matmul, bias add, +relu) and more. In TensorFlow Lite, LSTM quantization is currently only +implemented in the fused LSTM operations. + +### Challenges with fused operations + +Converting composite operations from TensorFlow to fused operations in +TensorFlow Lite is a hard problem. This is because: + +1. Composite operations are represented in the TensorFlow graph as a set of + primitive operations without a well defined boundary. It can be very + challenging to identify (e.g. via pattern matching) the sub-graph + corresponding to such a composite operation. + +1. There may be more than one TensorFlow implementation targeting a fused + TensorFlow Lite operation. For example, there are many LSTM implementations + in TensorFlow (Keras, Babelfish/lingvo etc) and each of these is composed of + different primitive operations but they all could still be converted to the + same fused LSTM operation in TensorFlow Lite. 
+ +As such, conversion of fused operations has proven quite challenging. + +## Converting from composite to fused operation + +The overall architecture for converting TensorFlow composite operations to +TensorFlow Lite fused operations is below: + +![drawing](../images/convert/op_fusion.png) + +### Wrap the composite operation in a `tf.function` + +In the TensorFlow model source code, identify and abstract out the composite +operation into a `tf.function` with the +[experimental\_implements](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/eager/function.py#L88) +function annotation. See an example of [embedding lookup](#composing_ops). The +function defines the interface, and its arguments should be used to implement the +conversion logic. + +### Write conversion code + +The conversion code is written per the interface of the function with the +`implements` annotation. See an example fusion for +[embedding lookup](#fusion_code). Conceptually, the conversion code replaces the +composite implementation of this interface with the fused one. + +In the prepare-composite-functions pass, plug in your +[conversion code](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc#L108). + +In more advanced use cases, it is possible to implement complex transformations of +the composite operation's operands in order to derive the operands of the fused +operation. See the +[Keras LSTM](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc#L627) +conversion code as an example. + +### Convert to TensorFlow Lite + +Use the +[TFLiteConverter.from_saved_model](https://www.tensorflow.org/api_docs/python/tf/lite/TFLiteConverter#from_saved_model) +API to convert to TensorFlow Lite. + +## Under the hood + + + +We now describe high-level details of the overall design in converting to fused +operations in TensorFlow Lite. + +### Composing operations in TensorFlow + +<a id="composing_ops"></a> + +The use of `tf.function` with the +[experimental\_implements](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/eager/function.py#L88) +function attribute allows users to explicitly compose new operations using +TensorFlow primitive operations and specify the interface that the resultant +composite operation implements. This is very useful as it provides: + +1. A well-defined boundary for the composite operation in the underlying + TensorFlow graph. +1. An explicit specification of the interface that this operation implements. The + arguments of the `tf.function` correspond to the arguments of this + interface. + +As an example, let’s consider a composite operation defined in +[Lingvo/TensorFlow](https://github.com/tensorflow/lingvo) to implement embedding +lookup. This maps to a fused operation in TensorFlow Lite. + +```python + @tf.function( + experimental_implements="lingvo.embedding_lookup") + def EmbFprop(embs, ids_vec): + """Embedding forward prop. + + Effectively, it computes: + num = size of ids_vec + rets = zeros([num, embedding dim]) + for i in range(num): + rets[i, :] = embs[ids_vec[i], :] + return rets + + Args: + embs: The embedding matrix. + ids_vec: A vector of int32 embedding ids. + + Returns: + The result of embedding lookups. A matrix of shape + [num ids in ids_vec, embedding dims].
+ """ + num = tf.shape(ids_vec)[0] + rets = inplace_ops.empty([num] + emb_shape_suf, py_utils.FPropDtype(p)) + + def EmbFpropLoop(i, embs, ids_vec, rets): + # row_id = ids_vec[i] + row_id = tf.gather(ids_vec, i) + # row = embs[row_id] + row = tf.reshape(tf.gather(embs, row_id), [1] + emb_shape_suf) + # rets[i] = row + rets = inplace_ops.alias_inplace_update(rets, [i], row) + return embs, ids_vec, rets + + _, _, rets = functional_ops.For( + start=0, + limit=num, + delta=1, + inputs=[embs, ids_vec, rets], + body=EmbFpropLoop, + rewrite_with_while=compiled) + if len(weight_shape) > 2: + rets = tf.reshape(rets, [num, symbolic.ToStatic(p.embedding_dim)]) + return rets +``` + +By making models use composite operations via `tf.function` as illustrated +above, it becomes possible to build a general infrastructure to **identify and +convert** such operations to fused TensorFlow Lite operations. + +### Extending the TensorFlow Lite converter + +The TensorFlow Lite converter that was released earlier this year only supported +importing TensorFlow models as a graph with all variables replaced with their +corresponding constant values. This does not work for operation fusion since +such graphs have all functions inlined so that the variables can be turned into +constants. + +In order to leverage the `tf.function` with the `experimental_implements` +feature, the functions need to be preserved until later in the +conversion process. + +As such, we implemented a new workflow of importing and converting TensorFlow +models in the converter to support the composite operation fusion use case. +Specifically, the new features added are: + +1. Importing TensorFlow + [saved models into MLIR](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc#L3593) +1. [Fusing composite operations](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc#L103) +1. [Analyzing variable mutability](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc#L43) +1. [Freezing all read-only variables](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/mlir/tensorflow/transforms/freeze_global_tensors.cc#L44) + +This allows us to perform operation fusion using the functions representing the +composite operations prior to function inlining and variable freezing. + +### Implementing operation fusion + +Let’s look at the operation fusion pass in more detail. This pass does the +following: + +1. Loop through all functions in the MLIR module. +1. If a function has the tf.\_implements attribute, call the appropriate + operation fusion utility based on the attribute value. +1. The operation fusion utility operates on the function’s operands and + attributes (which serve as the interface for the conversion) and replaces + the body of the function with an equivalent function body containing the + fused operation. +1. In many cases, the replaced body will contain operations other than the + fused operation. These correspond to some static transforms on the + function’s operands in order to obtain the operands of the fused operation. + Since these computations can all be constant folded away, they would not be + present in the exported flatbuffer where only the fused operation would + exist.
+ +Here is a code snippet from the pass showing the main workflow: + +``` +void PrepareCompositeFunctionsPass::ConvertTFImplements(FuncOp func, + StringAttr attr) { + if (attr.getValue() == "lingvo.embedding_lookup") { + func.eraseBody(); + func.addEntryBlock(); + // Convert the composite embedding_lookup function body to a + // TFLite fused embedding_lookup op. + ConvertEmbeddedLookupFunc convert_embedded_lookup(func); + if (failed(convert_embedded_lookup.VerifySignature())) { + return signalPassFailure(); + } + convert_embedded_lookup.RewriteFunc(); + } else if (attr.getValue() == mlir::TFL::kKerasLstm) { + func.eraseBody(); + func.addEntryBlock(); + OpBuilder builder(func.getBody()); + if (failed(ConvertKerasLSTMLayer(func, &builder))) { + return signalPassFailure(); + } + } else if (.....) /* Other fusions can plug in here */ +} +``` + +Here is a code snippet showing how this composite operation is mapped to a fused +operation in TensorFlow Lite, leveraging the function as a conversion interface. + +<a id="fusion_code"></a> + +```C++ +void RewriteFunc() { + Value lookup = func_.getArgument(1); + Value value = func_.getArgument(0); + auto output_type = func_.getType().getResult(0); + + OpBuilder builder(func_.getBody()); + auto op = builder.create<mlir::TFL::EmbeddingLookupOp>( + func_.getLoc(), output_type, lookup, value); + + builder.create<mlir::ReturnOp>(func_.getLoc(), op.getResult()); + } +``` diff --git a/tensorflow/lite/g3doc/convert/python_api.md b/tensorflow/lite/g3doc/convert/python_api.md index 3171306af13..0c43a795514 100644 --- a/tensorflow/lite/g3doc/convert/python_api.md +++ b/tensorflow/lite/g3doc/convert/python_api.md @@ -30,8 +30,8 @@ This document contains [example usages](#examples) of the API and ### Converting a SavedModel The following example shows how to convert a -[SavedModel](https://www.tensorflow.org/guide/saved_model) into a -TensorFlow Lite [`FlatBuffer`](https://google.github.io/flatbuffers/). +[SavedModel](https://www.tensorflow.org/guide/saved_model) into a TensorFlow +Lite [`FlatBuffer`](https://google.github.io/flatbuffers/). ```python import tensorflow as tf @@ -97,6 +97,24 @@ with tf.io.gfile.GFile('model.tflite', 'wb') as f: f.write(tflite_model) ``` +If your model requires specifying the input shape, use `tf.keras.layers.Input` +or `tf.keras.layers.InputLayer` to create a Keras model with a fixed input shape +as seen below, or use the [`from_concrete_functions`](#concrete_function) +classmethod as shown in the prior section to set the shape of the input arrays +prior to conversion. + +```python +input = tf.keras.layers.Input(shape=(1,), batch_size=1) +dense_layer = tf.keras.layers.Dense(units=1, input_shape=[1]) +model = tf.keras.Model(input, dense_layer(input)) +``` + +```python +model = tf.keras.models.Sequential( + [tf.keras.layers.InputLayer(input_shape=(1,), batch_size=1), + tf.keras.layers.Dense(units=1, input_shape=[1])]) +``` + ### Converting a concrete function The following example shows how to convert a TensorFlow diff --git a/tensorflow/lite/g3doc/convert/rnn.md b/tensorflow/lite/g3doc/convert/rnn.md index 734992c0904..ce9cf91f867 100644 --- a/tensorflow/lite/g3doc/convert/rnn.md +++ b/tensorflow/lite/g3doc/convert/rnn.md @@ -3,9 +3,9 @@ ## Overview TensorFlow Lite supports converting TensorFlow RNN models to TensorFlow Lite’s -fused LSTM operators. Fused operators exist to maximize the performance of their -underlying kernel implementations, as well as provide a higher level interface -to define complex transformations like quantizatization. +fused LSTM operations.
Fused operations exist to maximize the performance of +their underlying kernel implementations, as well as provide a higher-level +interface to define complex transformations like quantization. Since there are many variants of RNN APIs in TensorFlow, our approach has been two fold: @@ -23,15 +23,16 @@ two fold: ## Converter API -Currently this feature is available through the -[tf-nightly](https://pypi.org/project/tf-nightly/) pip or from head. This will -be available in the TensorFlow 2.3 release. +This feature is part of the TensorFlow 2.3 release. It is also available through the +[tf-nightly](https://pypi.org/project/tf-nightly/) pip or from head. This conversion functionality is available when converting to TensorFlow Lite via a SavedModel or from the Keras model directly. See example usages. ### From saved model + + ``` # build a saved model. Here concrete_function is the exported function # corresponding to the TensorFlow model containing one or more @@ -64,6 +65,8 @@ illustrates the end to end usage with the TensorFlow Lite interpreter. ## TensorFlow RNNs APIs supported + + ### Keras LSTM conversion (recommended) We support out-of-the-box conversion of Keras LSTM to TensorFlow Lite. For @@ -75,13 +78,17 @@ details on how this works please refer to the Also important is to highlight the TensorFlow Lite’s LSTM contract with respect to the Keras operation definition: -1. The dimension 0 of the input tensor is the batch size. -1. The dimension 0 of the recurrent\_weight tensor is the number of outputs. +1. The dimension 0 of the **input** tensor is the batch size. +1. The dimension 0 of the **recurrent\_weight** tensor is the number of + outputs. 1. The **weight** and **recurrent\_kernel** tensors are transposed. -1. The transposed weight, transposed recurrent\_kernel and bias tensors are +1. The transposed weight, transposed recurrent\_kernel and **bias** tensors are split into 4 equal sized tensors along the dimension 0. These correspond to **input gate, forget gate, cell, and output gate**. +See the detailed conversion code from Keras LSTM to TensorFlow Lite +[here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc#L627). + #### Keras LSTM Variants ##### Time major @@ -98,7 +105,7 @@ forward and one for backward, see examples [here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/keras/layers/wrappers.py#L381). Once we see the go\_backward attribute, we recognize it as backward LSTM, then we group forward & backward LSTM together. **This is future work.** Currently, -this creates two UnidirectionalSequenceLSTM operators in the TensorFlow Lite +this creates two UnidirectionalSequenceLSTM operations in the TensorFlow Lite model. ### User-defined LSTM conversion examples @@ -134,7 +141,7 @@ MLIR-pass [here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc#L108). The function’s interface should be treated like an API contract and should contain the arguments needed to convert to fused TensorFlow Lite LSTM -operators - i.e. input, bias, weights, projection, layer normalization, etc. It +operations - i.e. input, bias, weights, projection, layer normalization, etc. It is preferable for the tensors passed as arguments to this function to have known rank (i.e. RankedTensorType in MLIR).
This makes it much easier to write conversion code that can assume these tensors as RankedTensorType and helps @@ -189,5 +196,5 @@ follows: the user program. Such a TensorFlow program can still be converted to TensorFlow Lite using the feature being described here. 1. Bidirectional LSTM is currently modelled as two UnidirectionalSequenceLSTM - operators in TensorFlow Lite. This will be replaced with a single + operations in TensorFlow Lite. This will be replaced with a single BidirectionalSequenceLSTM op. diff --git a/tensorflow/lite/g3doc/guide/ops_compatibility.md b/tensorflow/lite/g3doc/guide/ops_compatibility.md index 054b7e0e275..d1462cb09c7 100644 --- a/tensorflow/lite/g3doc/guide/ops_compatibility.md +++ b/tensorflow/lite/g3doc/guide/ops_compatibility.md @@ -1147,11 +1147,8 @@ models: * `CALL` * `CONCAT_EMBEDDINGS` * `CUSTOM` -* `EMBEDDING_LOOKUP` * `EMBEDDING_LOOKUP_SPARSE` * `HASHTABLE_LOOKUP` * `LSH_PROJECTION` -* `LSTM` -* `RNN` * `SKIP_GRAM` * `SVDF` diff --git a/tensorflow/lite/g3doc/images/convert/op_fusion.png b/tensorflow/lite/g3doc/images/convert/op_fusion.png new file mode 100644 index 00000000000..bfee4acb53c Binary files /dev/null and b/tensorflow/lite/g3doc/images/convert/op_fusion.png differ diff --git a/tensorflow/lite/g3doc/images/convert/op_fusion_banner.jpg b/tensorflow/lite/g3doc/images/convert/op_fusion_banner.jpg new file mode 100644 index 00000000000..e7b1503774e Binary files /dev/null and b/tensorflow/lite/g3doc/images/convert/op_fusion_banner.jpg differ diff --git a/tensorflow/lite/g3doc/tutorials/model_maker_image_classification.ipynb b/tensorflow/lite/g3doc/tutorials/model_maker_image_classification.ipynb index 464a5d1b5ef..37b2395dec6 100644 --- a/tensorflow/lite/g3doc/tutorials/model_maker_image_classification.ipynb +++ b/tensorflow/lite/g3doc/tutorials/model_maker_image_classification.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": {}, @@ -49,7 +49,7 @@ "metadata": { "colab_type": "text", "id": "nDABAblytltI" - }, + }, "source": [ "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", " \u003ctd\u003e\n", @@ -93,7 +93,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -116,7 +116,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -131,6 +131,7 @@ "\n", "from tensorflow_examples.lite.model_maker.core.data_util.image_dataloader import ImageClassifierDataLoader\n", "from tensorflow_examples.lite.model_maker.core.task import image_classifier\n", + "from tensorflow_examples.lite.model_maker.core.task.configs import QuantizationConfig\n", "from tensorflow_examples.lite.model_maker.core.task.model_spec import mobilenet_v2_spec\n", "from tensorflow_examples.lite.model_maker.core.task.model_spec import ImageModelSpec\n", "\n", @@ -161,7 +162,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "cellView": "form", "colab": {}, @@ -221,7 +222,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -245,7 +246,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -268,7 +269,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": 
"code", @@ -294,7 +295,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -370,7 +371,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -398,7 +399,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -421,7 +422,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -445,7 +446,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -478,7 +479,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -501,7 +502,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -526,7 +527,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -549,7 +550,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -609,7 +610,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -644,7 +645,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -724,6 +725,83 @@ "In this section, we describe several advanced topics, including switching to a different image classification model, changing the training hyperparameters etc.\n" ] }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Gc4Jk8TvBQfm" + }, + "source": [ + "## Post-training quantization on the TensorFLow Lite model\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "tD8BOYrHBiDt" + }, + "source": [ + "[Post-training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) is a conversion technique that can reduce model size and inference latency, while also improving CPU and hardware accelerator latency, with little degradation in model accuracy. Thus, it's widely used to optimize the model.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "iyIo0d5TCzE2" + }, + "source": [ + "Model Maker supports multiple post-training quantization options. Let's take full integer quantization as an instance. First, define the quantization config to enforce enforce full integer quantization for all ops including the input and output. The input type and output type are `uint8` by default. You may also change them to other types like `int8` by setting `inference_input_type` and `inference_output_type` in config." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "k8hL2mstCxQl" + }, + "outputs": [], + "source": [ + "config = QuantizationConfig.create_full_integer_quantization(representative_data=test_data, is_integer_only=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "K1gzx_rmFMOA" + }, + "source": [ + "Then we export TensorFlow Lite model with such configuration." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "WTJzFQnJFMjr" + }, + "outputs": [], + "source": [ + "model.export(export_dir='.', tflite_filename='model_quant.tflite', quantization_config=config)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Safo0e40wKZW" + }, + "source": [ + "In Colab, you can download the model named `model_quant.tflite` from the left sidebar, same as the uploading part mentioned above." + ] + }, { "cell_type": "markdown", "metadata": { @@ -750,7 +828,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -773,7 +851,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -802,7 +880,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -871,7 +949,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -894,7 +972,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc index c96f298370a..800d7008b4b 100644 --- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc +++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc @@ -1892,61 +1892,70 @@ void NeonCwiseAdd(const int16_t* input_1, const int16_t* input_2, int n_batch, } } -void NeonCwiseClipping(int16_t* input, const int16_t clipping_value, - int32_t n_batch, int32_t n_input) { - const int16x8_t max_dup = vdupq_n_s16(clipping_value); - const int16x8_t min_dup = vdupq_n_s16(-clipping_value); - for (int batch = 0; batch < n_batch; ++batch) { - int i = 0; - for (; i <= n_input - 16; i += 16) { - const int index = batch * n_input + i; - int16x8_t val_0 = vld1q_s16(input + index); - int16x8_t val_1 = vld1q_s16(input + index + 8); - val_0 = vminq_s16(val_0, max_dup); - val_1 = vminq_s16(val_1, max_dup); - val_0 = vmaxq_s16(val_0, min_dup); - val_1 = vmaxq_s16(val_1, min_dup); - vst1q_s16(input + index, val_0); - vst1q_s16(input + index + 8, val_1); - } - for (; i < n_input; ++i) { - const int index = batch * n_input + i; - if (input[index] > clipping_value) { - input[index] = clipping_value; - } - if (input[index] < -clipping_value) { - input[index] = -clipping_value; - } - } +void NeonCwiseClipping(float* vector, const int v_size, + const float clipping_value) { + const float32x4_t clipping_value_f32x4 = vmovq_n_f32(clipping_value); + const float32x4_t neg_clipping_value_f32x4 = vmovq_n_f32(-clipping_value); + + int i = 0; + for (; i <= v_size - kFloatValuesPerNeonVector; + i += kFloatValuesPerNeonVector) { + // Load from memory to vector. + float32x4_t v_f32x4 = vld1q_f32(vector + i); + // Clip between clipping_value and -clipping_value. + v_f32x4 = vminq_f32(clipping_value_f32x4, v_f32x4); + v_f32x4 = vmaxq_f32(neg_clipping_value_f32x4, v_f32x4); + // Save to output. 
+ vst1q_f32(vector + i, v_f32x4); + } + for (; i < v_size; i++) { + vector[i] = std::max(std::min(clipping_value, vector[i]), -clipping_value); } } -void NeonCwiseClipping(int8_t* input, const int8_t clipping_value, - int32_t n_batch, int32_t n_input) { +void NeonCwiseClipping(int16_t* vector, const int v_size, + const int16_t clipping_value) { + const int16x8_t max_dup = vdupq_n_s16(clipping_value); + const int16x8_t min_dup = vdupq_n_s16(-clipping_value); + + int i = 0; + for (; i <= v_size - kInt16ValuesPerNeonVector * 2; + i += kInt16ValuesPerNeonVector * 2) { + int16x8_t val_0 = vld1q_s16(vector + i); + int16x8_t val_1 = vld1q_s16(vector + i + kInt16ValuesPerNeonVector); + val_0 = vminq_s16(val_0, max_dup); + val_1 = vminq_s16(val_1, max_dup); + val_0 = vmaxq_s16(val_0, min_dup); + val_1 = vmaxq_s16(val_1, min_dup); + vst1q_s16(vector + i, val_0); + vst1q_s16(vector + i + kInt16ValuesPerNeonVector, val_1); + } + for (; i < v_size; i++) { + vector[i] = std::max(std::min(clipping_value, vector[i]), + static_cast(-clipping_value)); + } +} + +void NeonCwiseClipping(int8_t* vector, const int v_size, + const int8_t clipping_value) { const int8x16_t max_dup = vdupq_n_s8(clipping_value); const int8x16_t min_dup = vdupq_n_s8(-clipping_value); - for (int batch = 0; batch < n_batch; ++batch) { - int i = 0; - for (; i <= n_input - 32; i += 32) { - const int index = batch * n_input + i; - int8x16_t val_0 = vld1q_s8(input + index); - int8x16_t val_1 = vld1q_s8(input + index + 16); - val_0 = vminq_s8(val_0, max_dup); - val_1 = vminq_s8(val_1, max_dup); - val_0 = vmaxq_s8(val_0, min_dup); - val_1 = vmaxq_s8(val_1, min_dup); - vst1q_s8(input + index, val_0); - vst1q_s8(input + index + 16, val_1); - } - for (; i < n_input; ++i) { - const int index = batch * n_input + i; - if (input[index] > clipping_value) { - input[index] = clipping_value; - } - if (input[index] < -clipping_value) { - input[index] = -clipping_value; - } - } + + int i = 0; + for (; i < v_size - kInt8ValuesPerNeonVector * 2; + i += kInt8ValuesPerNeonVector * 2) { + int8x16_t val_0 = vld1q_s8(vector + i); + int8x16_t val_1 = vld1q_s8(vector + i + kInt8ValuesPerNeonVector); + val_0 = vminq_s8(val_0, max_dup); + val_1 = vminq_s8(val_1, max_dup); + val_0 = vmaxq_s8(val_0, min_dup); + val_1 = vmaxq_s8(val_1, min_dup); + vst1q_s8(vector + i, val_0); + vst1q_s8(vector + i + kInt8ValuesPerNeonVector, val_1); + } + for (; i < v_size; i++) { + vector[i] = std::max(std::min(clipping_value, vector[i]), + static_cast(-clipping_value)); } } @@ -2208,34 +2217,6 @@ bool NeonIsZeroVector(const int8_t* vector, int v_size) { return true; } -void NeonClipVector(const float* vector, int v_size, float abs_limit, - float* result) { - // If v_size is not divisible by the vector size, then we need to process the - // final few elements sequentially. postamble_start shows the start index - // where this should happen. - const int postamble_start = - RoundDownVectors(v_size); - - // Replicate abs_limit and -abs_limit in two vectors. - const float32x4_t abs_limit_f32x4 = vmovq_n_f32(abs_limit); - const float32x4_t neg_abs_limit_f32x4 = vmovq_n_f32(-abs_limit); - - int v = 0; - for (; v < postamble_start; v += kFloatValuesPerNeonVector) { - // Load from memory to vector. - float32x4_t v_f32x4 = vld1q_f32(vector + v); - // Clip between abs_limit and -abs_limit. - float32x4_t result_f32x4 = vminq_f32(abs_limit_f32x4, v_f32x4); - result_f32x4 = vmaxq_f32(neg_abs_limit_f32x4, result_f32x4); - // Save to output. 
- vst1q_f32(result + v, result_f32x4); - } - // Postamble loop. - for (; v < v_size; v++) { - result[v] = std::max(std::min(abs_limit, vector[v]), -abs_limit); - } -} - void NeonVectorScalarMultiply(const int8_t* vector, const int v_size, const float scale, float* result) { // Here the assumption is that each buffer is 4-byte aligned. diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h index 86951fcd559..7417e836b5c 100644 --- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h +++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h @@ -198,14 +198,17 @@ void CwiseAdd(const int16_t* input_1, const int16_t* input_2, int n_batch, NEON_OR_PORTABLE(CwiseAdd, input_1, input_2, n_batch, n_input, output); } -void CwiseClipping(int16_t* input, const int16_t clipping_value, - int32_t n_batch, int32_t n_input) { - NEON_OR_PORTABLE(CwiseClipping, input, clipping_value, n_batch, n_input); +void CwiseClipping(float* vector, const int v_size, + const float clipping_value) { + NEON_OR_PORTABLE(CwiseClipping, vector, v_size, clipping_value); } - -void CwiseClipping(int8_t* input, const int8_t clipping_value, int32_t n_batch, - int32_t n_input) { - NEON_OR_PORTABLE(CwiseClipping, input, clipping_value, n_batch, n_input); +void CwiseClipping(int16_t* vector, const int v_size, + const int16_t clipping_value) { + NEON_OR_PORTABLE(CwiseClipping, vector, v_size, clipping_value); +} +void CwiseClipping(int8_t* vector, const int v_size, + const int8_t clipping_value) { + NEON_OR_PORTABLE(CwiseClipping, vector, v_size, clipping_value); } void BatchVectorBatchVectorDotProduct(const int16_t* vector1, @@ -255,10 +258,6 @@ void VectorScalarMultiply(const int8_t* vector, int v_size, float scale, float* result) { NEON_OR_PORTABLE(VectorScalarMultiply, vector, v_size, scale, result); } -void ClipVector(const float* vector, int v_size, float abs_limit, - float* result) { - NEON_OR_PORTABLE(ClipVector, vector, v_size, abs_limit, result); -} void SymmetricQuantizeFloats(const float* values, const int size, int8_t* quantized_values, float* min_value, diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h index 1554d07a61c..44bc83a0669 100644 --- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h +++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h @@ -83,11 +83,12 @@ void NeonCwiseMul(const int16_t* input_1, const int16_t* input_2, void NeonCwiseAdd(const int16_t* input_1, const int16_t* input_2, int n_batch, int n_input, int16_t* output); -void NeonCwiseClipping(int16_t* input, const int16_t clipping_value, - int32_t n_batch, int32_t n_input); - -void NeonCwiseClipping(int8_t* input, const int8_t clipping_value, - int32_t n_batch, int32_t n_input); +void NeonCwiseClipping(float* vector, const int v_size, + const float clipping_value); +void NeonCwiseClipping(int16_t* vector, const int v_size, + const int16_t clipping_value); +void NeonCwiseClipping(int8_t* vector, const int v_size, + const int8_t clipping_value); void NeonMatrixBatchVectorMultiplyAccumulate( const int8_t* input, const int32_t* bias, @@ -133,10 +134,6 @@ void NeonSub1Vector(const float* vector, int v_size, float* result); void NeonSub1Vector(const int16_t* vector, int v_size, int16_t* result); -// Clip elements of a vector using a abs_limit value. 
-void NeonClipVector(const float* vector, int v_size, float abs_limit, - float* result); - // Multiply all elements of vector with a scalar. void NeonVectorScalarMultiply(const int8_t* vector, int v_size, float scale, float* result); diff --git a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h index 224d811e862..af29dda7229 100644 --- a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h +++ b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h @@ -206,14 +206,19 @@ void CwiseAdd(const int16_t* input_1, const int16_t* input_2, int n_batch, PortableCwiseAdd(input_1, input_2, n_batch, n_input, output); } -void CwiseClipping(int16_t* input, const int16_t clipping_value, - int32_t n_batch, int32_t n_input) { - PortableCwiseClipping(input, clipping_value, n_batch, n_input); +void CwiseClipping(float* vector, const int v_size, + const float clipping_value) { + PortableCwiseClipping(vector, v_size, clipping_value); } -void CwiseClipping(int8_t* input, const int8_t clipping_value, int32_t n_batch, - int32_t n_input) { - PortableCwiseClipping(input, clipping_value, n_batch, n_input); +void CwiseClipping(int16_t* vector, const int v_size, + const int16_t clipping_value) { + PortableCwiseClipping(vector, v_size, clipping_value); +} + +void CwiseClipping(int8_t* vector, const int v_size, + const int8_t clipping_value) { + PortableCwiseClipping(vector, v_size, clipping_value); } void BatchVectorBatchVectorDotProduct(const int16_t* vector1, @@ -263,10 +268,6 @@ void VectorScalarMultiply(const int8_t* vector, int v_size, float scale, float* result) { NEON_OR_PORTABLE(VectorScalarMultiply, vector, v_size, scale, result); } -void ClipVector(const float* vector, int v_size, float abs_limit, - float* result) { - NEON_OR_PORTABLE(ClipVector, vector, v_size, abs_limit, result); -} void SymmetricQuantizeFloats(const float* values, const int size, int8_t* quantized_values, float* min_value, diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc index 4f6db290d4f..856331a62e7 100644 --- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc +++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc @@ -651,36 +651,6 @@ void PortableCwiseAdd(const int16_t* input_1, const int16_t* input_2, } } -void PortableCwiseClipping(int16_t* input, const int16_t clipping_value, - int32_t n_batch, int32_t n_input) { - for (int batch = 0; batch < n_batch; ++batch) { - for (int i = 0; i < n_input; ++i) { - const int index = batch * n_input + i; - if (input[index] > clipping_value) { - input[index] = clipping_value; - } - if (input[index] < -clipping_value) { - input[index] = -clipping_value; - } - } - } -} - -void PortableCwiseClipping(int8_t* input, const int8_t clipping_value, - int32_t n_batch, int32_t n_input) { - for (int batch = 0; batch < n_batch; ++batch) { - for (int i = 0; i < n_input; ++i) { - const int index = batch * n_input + i; - if (input[index] > clipping_value) { - input[index] = clipping_value; - } - if (input[index] < -clipping_value) { - input[index] = -clipping_value; - } - } - } -} - float PortableVectorVectorDotProduct(const float* vector1, const float* vector2, int v_size) { float result = 0.0; @@ -757,13 +727,6 @@ void PortableVectorScalarMultiply(const int8_t* vector, const int v_size, } } -void PortableClipVector(const float* vector, int v_size, float abs_limit, - float* result) { - 
for (int v = 0; v < v_size; v++) { - result[v] = std::max(std::min(abs_limit, vector[v]), -abs_limit); - } -} - void PortableReductionSumVector(const float* input_vector, float* output_vector, int output_size, int reduction_size) { const float* input_vector_ptr = input_vector; diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h index 0fd7a407595..ecb7fe8ea2b 100644 --- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h +++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h @@ -230,14 +230,19 @@ void CwiseAdd(const int16_t* input_1, const int16_t* input_2, int n_batch, PortableCwiseAdd(input_1, input_2, n_batch, n_input, output); } -void CwiseClipping(int16_t* input, const int16_t clipping_value, - int32_t n_batch, int32_t n_input) { - PortableCwiseClipping(input, clipping_value, n_batch, n_input); +void CwiseClipping(float* vector, const int v_size, + const float clipping_value) { + PortableCwiseClipping(vector, v_size, clipping_value); } -void CwiseClipping(int8_t* input, const int8_t clipping_value, int32_t n_batch, - int32_t n_input) { - PortableCwiseClipping(input, clipping_value, n_batch, n_input); +void CwiseClipping(int16_t* vector, const int v_size, + const int16_t clipping_value) { + PortableCwiseClipping(vector, v_size, clipping_value); +} + +void CwiseClipping(int8_t* vector, const int v_size, + const int8_t clipping_value) { + PortableCwiseClipping(vector, v_size, clipping_value); } void VectorBatchVectorCwiseProductAccumulate(const int16_t* vector, int v_size, @@ -279,11 +284,6 @@ void VectorScalarMultiply(const int8_t* vector, int v_size, float scale, PortableVectorScalarMultiply(vector, v_size, scale, result); } -void ClipVector(const float* vector, int v_size, float abs_limit, - float* result) { - PortableClipVector(vector, v_size, abs_limit, result); -} - void ReductionSumVector(const float* input_vector, float* output_vector, int output_size, int reduction_size) { PortableReductionSumVector(input_vector, output_vector, output_size, diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h index 34767ccd942..556e4640cbb 100644 --- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h +++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h @@ -15,6 +15,7 @@ limitations under the License. #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_ +#include #include // TODO(ghodrat): Remove this header file and the dependency to internal data @@ -33,9 +34,6 @@ class CpuBackendContext; namespace tensor_utils { -// Limit a float input f between +abs_limit and -abs_limit. 
-float PortableClip(float f, float abs_limit); - template <typename T> bool PortableIsZeroVector(const T* vector, int v_size) { for (int i = 0; i < v_size; ++i) { @@ -178,11 +176,14 @@ void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2, void PortableCwiseAdd(const int16_t* input_1, const int16_t* input_2, int n_batch, int n_input, int16_t* output); -void PortableCwiseClipping(int16_t* input, const int16_t clipping_value, - int32_t n_batch, int32_t n_input); - -void PortableCwiseClipping(int8_t* input, const int8_t clipping_value, - int32_t n_batch, int32_t n_input); +template <typename T> +void PortableCwiseClipping(T* vector, const int v_size, + const T clipping_value) { + for (int i = 0; i < v_size; i++) { + vector[i] = std::max(std::min(clipping_value, vector[i]), + static_cast<T>(-clipping_value)); + } +} // Batch vector initialization with another vector. void PortableVectorBatchVectorAssign(const float* vector, int v_size, @@ -201,10 +202,6 @@ void PortableSub1Vector(const int16_t* vector, int v_size, int16_t* result); void PortableVectorScalarMultiply(const int8_t* vector, int v_size, float scale, float* result); -// Clip elements of a vector using a abs_limit value. -void PortableClipVector(const float* vector, int v_size, float abs_limit, - float* result); - // Reduce-sum on a float input vector: // input_vector: float pointer to input vector. // output_vector: float pointer to vector. diff --git a/tensorflow/lite/kernels/internal/tensor_utils.h b/tensorflow/lite/kernels/internal/tensor_utils.h index 8c956c49f5f..716fbaa740e 100644 --- a/tensorflow/lite/kernels/internal/tensor_utils.h +++ b/tensorflow/lite/kernels/internal/tensor_utils.h @@ -406,23 +406,16 @@ void CwiseMul(const int16_t* input_1, const int16_t* input_2, void CwiseAdd(const int16_t* input_1, const int16_t* input_2, int n_batch, int n_input, int16_t* output); -// Element-wise in-place clipping of a quantized vector. -// Parameters: -// - input: batch vector of size n_batch * n_input; 16 bit. +// Element-wise in-place clipping of a vector. Overloaded for float, int16_t, +// int8_t. Parameters: +// - vector: vector of size v_size. +// - v_size: the size of the vector. // - clipping_value: the value used for clipping. -// - n_batch: the number of batches. -// - n_input: the size for input and output. -void CwiseClipping(int16_t* input, const int16_t clipping_value, - int32_t n_batch, int32_t n_input); - -// Element-wise in-place clipping of a quantized vector. -// Parameters: -// - input: batch vector of size n_batch * n_input; 8 bit. -// - clipping_value: the value used for clipping. -// - n_batch: the number of batches. -// - n_input: the size for input and output. -void CwiseClipping(int8_t* input, const int8_t clipping_value, int32_t n_batch, - int32_t n_input); +void CwiseClipping(float* vector, const int v_size, const float clipping_value); +void CwiseClipping(int16_t* vector, const int v_size, + const int16_t clipping_value); +void CwiseClipping(int8_t* vector, const int v_size, + const int8_t clipping_value); // Cwise product of two vectors. template <typename T> @@ -611,10 +604,6 @@ void Sub1Vector(const int16_t* vector, int v_size, int16_t* result); void VectorScalarMultiply(const int8_t* vector, int v_size, float scale, float* result); -// Clip elements of a vector using a abs_limit value. -void ClipVector(const float* vector, int v_size, float abs_limit, - float* result); - // Reduce-sum on a float input vector: // input_vector: float pointer to input vector. // output_vector: float pointer to vector.
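The unified CwiseClipping entry points above take a flat element count (v_size) and clip in place, so callers no longer pass separate batch/input dimensions or a destination buffer. Below is a minimal usage sketch, not part of the patch; it assumes the internal tensor_utils target above is linked, and the function name, buffers, and limits are illustrative only.

#include <cstdint>
#include <vector>

#include "tensorflow/lite/kernels/internal/tensor_utils.h"

// Clips a float state vector and an int16 gate vector in place; the same
// (data, element_count, clipping_value) call shape selects each overload.
void ClipStatesExample() {
  std::vector<float> cell_state = {0.0f, -2.5f, 3.0f, -3.5f};
  std::vector<int16_t> gate = {-10500, 1, -7404, 200};
  tflite::tensor_utils::CwiseClipping(cell_state.data(),
                                      static_cast<int>(cell_state.size()),
                                      /*clipping_value=*/2.0f);
  // cell_state is now {0.0, -2.0, 2.0, -2.0}.
  tflite::tensor_utils::CwiseClipping(gate.data(),
                                      static_cast<int>(gate.size()),
                                      /*clipping_value=*/int16_t{300});
  // gate is now {-300, 1, -300, 200}.
}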
diff --git a/tensorflow/lite/kernels/internal/tensor_utils_test.cc b/tensorflow/lite/kernels/internal/tensor_utils_test.cc index 878cf0d2618..825070cf510 100644 --- a/tensorflow/lite/kernels/internal/tensor_utils_test.cc +++ b/tensorflow/lite/kernels/internal/tensor_utils_test.cc @@ -37,18 +37,6 @@ TEST(uKernels, FloorLog2Test) { } } -TEST(uKernels, ClipTest) { - constexpr int kVectorSize = 10; - constexpr float kAbsLimit = 2.0; - static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0, - -2.5, 3.0, -3.5, 4.0, -4.5}; - std::vector output(kVectorSize); - ClipVector(input, kVectorSize, kAbsLimit, output.data()); - EXPECT_THAT(output, - ElementsAreArray(ArrayFloatNear( - {0.0, -0.5, 1.0, -1.5, 2.0, -2.0, 2.0, -2.0, 2.0, -2.0}))); -} - TEST(uKernels, VectorScalarMultiply) { constexpr int kVectorSize = 29; static int8_t input[kVectorSize]; @@ -976,15 +964,28 @@ TEST(uKernels, QuantAddTest) { EXPECT_THAT(output, testing::ElementsAreArray(expected_output)); } +TEST(uKernels, ClipTest) { + constexpr int kVectorSize = 10; + constexpr float kAbsLimit = 2.0; + std::vector input = {0.0, -0.5, 1.0, -1.5, 2.0, + -2.5, 3.0, -3.5, 4.0, -4.5}; + CwiseClipping(input.data(), kVectorSize, kAbsLimit); + const std::vector expected_output = {0.0, -0.5, 1.0, -1.5, 2.0, + -2.0, 2.0, -2.0, 2.0, -2.0}; + EXPECT_THAT(input, testing::ElementsAreArray(expected_output)); +} + // Quantized clipping for 16 bit. TEST(uKernels, QuantClip16Test) { + constexpr int kVectorSize = 30; + constexpr int16_t kAbsLimit = 300; std::vector input = { -10500, 1, -2, -7404, 200, -5401, -1757, -7668, -19248, -9692, -24249, -17923, -15840, -10026, 5249, -89, 1787, -200, -6691, -19524, -13439, -24048, -1123, 32767, -17267, -3378, 823, 11482, -11139, 7508, }; - CwiseClipping(input.data(), 300, 2, 15); + CwiseClipping(input.data(), kVectorSize, kAbsLimit); const std::vector expected_output = { -300, 1, -2, -300, 200, -300, -300, -300, -300, -300, -300, -300, -300, -300, 300, -89, 300, -200, -300, -300, @@ -995,11 +996,13 @@ TEST(uKernels, QuantClip16Test) { // Quantized clipping for 8 bit. TEST(uKernels, QuantClip8Test) { + constexpr int kVectorSize = 30; + constexpr int8_t kAbsLimit = 32; std::vector input = { 4, -11, -5, -34, -10, -17, -27, -22, 15, 127, -128, 1, 3, 56, 3, -21, 1, 9, -13, 10, 0, -1, -55, -40, 127, -128, 11, 4, 6, 32, }; - CwiseClipping(input.data(), 32, 2, 15); + CwiseClipping(input.data(), kVectorSize, kAbsLimit); const std::vector expected_output = { 4, -11, -5, -32, -10, -17, -27, -22, 15, 32, -32, 1, 3, 32, 3, -21, 1, 9, -13, 10, 0, -1, -32, -32, 32, -32, 11, 4, 6, 32, diff --git a/tensorflow/lite/kernels/lstm_eval.cc b/tensorflow/lite/kernels/lstm_eval.cc index ca8344d863b..3f74f3e7fff 100644 --- a/tensorflow/lite/kernels/lstm_eval.cc +++ b/tensorflow/lite/kernels/lstm_eval.cc @@ -374,8 +374,8 @@ inline void LstmStepFloat( cell_state_ptr); } if (params->cell_clip > 0.0) { - tensor_utils::ClipVector(cell_state_ptr, n_batch * n_cell, - params->cell_clip, cell_state_ptr); + tensor_utils::CwiseClipping(cell_state_ptr, n_batch * n_cell, + params->cell_clip); } // For each batch and cell: update the output gate. 
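The lstm_eval call sites above and below change mechanically: ClipVector took a source pointer plus a result pointer (aliased to the same buffer here), and the quantized CwiseClipping variants previously took (input, clipping_value, n_batch, n_input); both now collapse to an in-place call with a flat element count. A schematic before/after, using the names from the surrounding hunks:

// Before: separate (aliased) result pointer, batch and per-batch sizes.
tensor_utils::ClipVector(cell_state_ptr, n_batch * n_cell,
                         params->cell_clip, /*result=*/cell_state_ptr);
// After: in-place clipping; only the buffer, element count, and limit remain.
tensor_utils::CwiseClipping(cell_state_ptr, n_batch * n_cell,
                            params->cell_clip);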
@@ -415,8 +415,8 @@ inline void LstmStepFloat( projection_weights_ptr, n_output, n_cell, output_gate_scratch, n_batch, output_state_ptr); if (params->proj_clip > 0.0) { - tensor_utils::ClipVector(output_state_ptr, n_batch * n_output, - params->proj_clip, output_state_ptr); + tensor_utils::CwiseClipping(output_state_ptr, n_batch * n_output, + params->proj_clip); } } else { std::copy_n(output_gate_scratch, n_batch * n_output, output_state_ptr); @@ -837,8 +837,8 @@ inline void LstmStepHybrid( cell_state_ptr); } if (params->cell_clip > 0.0) { - tensor_utils::ClipVector(cell_state_ptr, n_batch * n_cell, - params->cell_clip, cell_state_ptr); + tensor_utils::CwiseClipping(cell_state_ptr, n_batch * n_cell, + params->cell_clip); } // For each batch and cell: update the output gate. @@ -893,8 +893,8 @@ inline void LstmStepHybrid( scaling_factors_scratch, context); } if (params->proj_clip > 0.0) { - tensor_utils::ClipVector(output_state_ptr, n_batch * n_output, - params->proj_clip, output_state_ptr); + tensor_utils::CwiseClipping(output_state_ptr, n_batch * n_output, + params->proj_clip); } } else { std::copy_n(output_gate_scratch, n_batch * n_output, output_state_ptr); @@ -1187,8 +1187,8 @@ inline void LstmStepInteger8x8_16( n_cell, cell_state_ptr); if (quantized_cell_clip > 0) { - tensor_utils::CwiseClipping(cell_state_ptr, quantized_cell_clip, n_batch, - n_cell); + tensor_utils::CwiseClipping(cell_state_ptr, n_batch * n_cell, + quantized_cell_clip); } // Ouptut gate. @@ -1234,8 +1234,8 @@ inline void LstmStepInteger8x8_16( effective_proj_scale_a, effective_proj_scale_b, n_batch, n_cell, n_output, output_state_zp, scratch5, output_ptr, context); if (quantized_proj_clip > 0) { - tensor_utils::CwiseClipping(output_ptr, quantized_proj_clip, n_batch, - n_output); + tensor_utils::CwiseClipping(output_ptr, n_batch * n_output, + quantized_proj_clip); } } else { std::copy_n(scratch4, n_batch * n_output, output_ptr); @@ -1498,8 +1498,8 @@ inline void LstmStepInteger8x8_8( tensor_utils::CwiseAdd(scratch6, scratch7, n_batch, n_cell, cell_state_ptr); if (quantized_cell_clip > 0) { - tensor_utils::CwiseClipping(cell_state_ptr, quantized_cell_clip, n_batch, - n_cell); + tensor_utils::CwiseClipping(cell_state_ptr, n_batch * n_cell, + quantized_cell_clip); } // Cell to hidden. @@ -1517,8 +1517,8 @@ inline void LstmStepInteger8x8_8( // Projection clipping. if (quantized_proj_clip > 0) { - tensor_utils::CwiseClipping(output_ptr, quantized_proj_clip, n_batch, - n_output); + tensor_utils::CwiseClipping(output_ptr, n_batch * n_output, + quantized_proj_clip); } // Copy output to output state. diff --git a/tensorflow/lite/micro/examples/network_tester/Makefile.inc b/tensorflow/lite/micro/examples/network_tester/Makefile.inc index 27f54a66763..a5c911238c8 100644 --- a/tensorflow/lite/micro/examples/network_tester/Makefile.inc +++ b/tensorflow/lite/micro/examples/network_tester/Makefile.inc @@ -33,6 +33,10 @@ ifeq ($(COMPARE_OUTPUT_DATA),no) CXXFLAGS += -DNO_COMPARE_OUTPUT_DATA endif +ifdef NUM_INFERENCES + CXXFLAGS += -DNUM_INFERENCES=$(NUM_INFERENCES) +endif + # Builds a standalone object recognition binary. 
$(eval $(call microlite_test,network_tester_test,\ $(NETWORK_TESTER_TEST_SRCS),$(NETWORK_TESTER_TEST_HDRS))) diff --git a/tensorflow/lite/micro/examples/network_tester/README.md b/tensorflow/lite/micro/examples/network_tester/README.md index 7c4c48e4eb1..f4415222133 100644 --- a/tensorflow/lite/micro/examples/network_tester/README.md +++ b/tensorflow/lite/micro/examples/network_tester/README.md @@ -34,8 +34,18 @@ make -f tensorflow/lite/micro/tools/make/Makefile network_tester_test \ `ARENA_SIZE`: The size of the memory to be allocated (in bytes) by the interpreter. \ `NUM_BYTES_TO_PRINT`: The number of bytes of the output data to print. \ -Defaults to 0 if not specified. \ +If set to 0, all bytes of the output are printed. \ `COMPARE_OUTPUT_DATA`: If set to "no" the output data is not compared to the expected output data. This could be useful e.g. if the execution time needs to be minimized, or there is no expected output data. If omitted, the output data -is compared to the expected output. +is compared to the expected output. \ `NUM_INFERENCES`: Defines how many inferences +are made. Defaults to 1. \ + +The output is printed in JSON format using printf: `num_of_outputs: 1 +output_begin [ { "dims": [4,1,2,2,1], "data_address": "0x000000", +"data":"0x06,0x08,0x0e,0x10" }] output_end` + +If there are multiple output tensors, the output will look like this: +`num_of_outputs: 2 output_begin [ { "dims": [4,1,2,2,1], "data_address": +"0x000000", "data":"0x06,0x08,0x0e,0x10" }, { "dims": [4,1,2,2,1], +"data_address": "0x111111", "data":"0x06,0x08,0x0e,0x10" }] output_end` diff --git a/tensorflow/lite/micro/examples/network_tester/expected_output_data.h b/tensorflow/lite/micro/examples/network_tester/expected_output_data.h index 03e21954b7f..934722bad94 100644 --- a/tensorflow/lite/micro/examples/network_tester/expected_output_data.h +++ b/tensorflow/lite/micro/examples/network_tester/expected_output_data.h @@ -17,6 +17,6 @@ limitations under the License. #define TENSORFLOW_LITE_MICRO_EXAMPLES_NETWORK_TESTER_EXPECTED_OUTPUT_DATA_H_ static unsigned int expected_output_data_len = 4; -static unsigned char expected_output_data[] = {6, 8, 14, 16}; +static unsigned char expected_output_data[1][4] = {6, 8, 14, 16}; #endif // TENSORFLOW_LITE_MICRO_EXAMPLES_NETWORK_TESTER_EXPECTED_OUTPUT_DATA_H_ diff --git a/tensorflow/lite/micro/examples/network_tester/input_data.h b/tensorflow/lite/micro/examples/network_tester/input_data.h index b47277cca93..a94f6f90139 100644 --- a/tensorflow/lite/micro/examples/network_tester/input_data.h +++ b/tensorflow/lite/micro/examples/network_tester/input_data.h @@ -17,7 +17,7 @@ limitations under the License. #define TENSORFLOW_LITE_MICRO_EXAMPLES_NETWORK_TESTER_INPUT_DATA_H_ static const int input_data_len = 16; -static const unsigned char input_data[] = {1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 16}; +static const unsigned char input_data[1][16] = { + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}}; #endif // TENSORFLOW_LITE_MICRO_EXAMPLES_NETWORK_TESTER_INPUT_DATA_H_ diff --git a/tensorflow/lite/micro/examples/network_tester/network_model.h b/tensorflow/lite/micro/examples/network_tester/network_model.h index 4c275dbfbba..5b4b4cf3070 100644 --- a/tensorflow/lite/micro/examples/network_tester/network_model.h +++ b/tensorflow/lite/micro/examples/network_tester/network_model.h @@ -1,8 +1,11 @@ /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/tensorflow/lite/micro/examples/network_tester/network_tester_test.cc b/tensorflow/lite/micro/examples/network_tester/network_tester_test.cc index 9295c87063b..e6eefe003f0 100644 --- a/tensorflow/lite/micro/examples/network_tester/network_tester_test.cc +++ b/tensorflow/lite/micro/examples/network_tester/network_tester_test.cc @@ -1,8 +1,11 @@ /* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -25,29 +28,37 @@ limitations under the License. #define TENSOR_ARENA_SIZE (1024) #endif +#ifndef NUM_INFERENCES +#define NUM_INFERENCES 1 +#endif + uint8_t tensor_arena[TENSOR_ARENA_SIZE]; #ifdef NUM_BYTES_TO_PRINT inline void print_output_data(TfLiteTensor* output) { int num_bytes_to_print = - (output->bytes < NUM_BYTES_TO_PRINT) ? output->bytes : NUM_BYTES_TO_PRINT; + ((output->bytes < NUM_BYTES_TO_PRINT) || NUM_BYTES_TO_PRINT == 0) + ? output->bytes + : NUM_BYTES_TO_PRINT; int dims_size = output->dims->size; - printf("dims: {%d,", dims_size); + printf("{\n"); + printf("\"dims\": [%d,", dims_size); for (int i = 0; i < output->dims->size - 1; ++i) { printf("%d,", output->dims->data[i]); } - printf("%d}\n", output->dims->data[dims_size - 1]); + printf("%d],\n", output->dims->data[dims_size - 1]); - printf("data_address: %p\n", output->data.raw); - printf("data:\n{"); + printf("\"data_address\": \"%p\",\n", output->data.raw); + printf("\"data\":\""); for (int i = 0; i < num_bytes_to_print - 1; ++i) { - if (i % 16 == 0) { + if (i % 16 == 0 && i != 0) { printf("\n"); } printf("0x%02x,", output->data.uint8[i]); } - printf("0x%02x\n}\n", output->data.uint8[num_bytes_to_print - 1]); + printf("0x%02x\"\n", output->data.uint8[num_bytes_to_print - 1]); + printf("}"); } #endif @@ -63,7 +74,7 @@ TF_LITE_MICRO_TEST(TestInvoke) { "Model provided is schema version %d not equal " "to supported version %d.\n", model->version(), TFLITE_SCHEMA_VERSION); - return 1; + return kTfLiteError; } tflite::AllOpsResolver resolver; @@ -74,29 +85,48 @@ TF_LITE_MICRO_TEST(TestInvoke) { TfLiteStatus allocate_status = interpreter.AllocateTensors(); if (allocate_status != kTfLiteOk) { TF_LITE_REPORT_ERROR(error_reporter, "Tensor allocation failed\n"); + return kTfLiteError; } - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, allocate_status); - TfLiteTensor* input = interpreter.input(0); - memcpy(input->data.uint8, input_data, input->bytes); - - TfLiteStatus invoke_status = interpreter.Invoke(); - if (invoke_status != kTfLiteOk) { - TF_LITE_REPORT_ERROR(error_reporter, "Invoke failed\n"); - } - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, invoke_status); - - TfLiteTensor* output = interpreter.output(0); + for (int n = 0; n < NUM_INFERENCES; n++) { + for (int i = 0; i < interpreter.inputs_size(); 
++i) { + TfLiteTensor* input = interpreter.input(i); + memcpy(input->data.uint8, input_data[i], input->bytes); + } + TfLiteStatus invoke_status = interpreter.Invoke(); + if (invoke_status != kTfLiteOk) { + TF_LITE_REPORT_ERROR(error_reporter, "Invoke failed\n"); + return kTfLiteError; + } + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, invoke_status); #ifdef NUM_BYTES_TO_PRINT - print_output_data(output); + // Print all of the output data, or the first NUM_BYTES_TO_PRINT bytes, + // whichever comes first as well as the output shape. + printf("num_of_outputs: %d\n", interpreter.outputs_size()); + printf("output_begin\n"); + printf("[\n"); + for (int i = 0; i < interpreter.outputs_size(); i++) { + TfLiteTensor* output = interpreter.output(i); + print_output_data(output); + if (i != interpreter.outputs_size() - 1) { + printf(",\n"); + } + } + printf("]\n"); + printf("output_end\n"); #endif #ifndef NO_COMPARE_OUTPUT_DATA - for (int i = 0; i < output->bytes; ++i) { - TF_LITE_MICRO_EXPECT_EQ(output->data.uint8[i], expected_output_data[i]); - } + for (int i = 0; i < interpreter.outputs_size(); i++) { + TfLiteTensor* output = interpreter.output(i); + for (int j = 0; j < output->bytes; ++j) { + TF_LITE_MICRO_EXPECT_EQ(output->data.uint8[j], + expected_output_data[i][j]); + } + } #endif + } TF_LITE_REPORT_ERROR(error_reporter, "Ran successfully\n"); } diff --git a/tensorflow/lite/micro/memory_arena_threshold_test.cc b/tensorflow/lite/micro/memory_arena_threshold_test.cc index b45de85a21b..c698f2c7115 100644 --- a/tensorflow/lite/micro/memory_arena_threshold_test.cc +++ b/tensorflow/lite/micro/memory_arena_threshold_test.cc @@ -31,7 +31,7 @@ namespace { // Ensure memory doesn't expand more that 3%: constexpr float kAllocationThreshold = 0.03; constexpr float kAllocationTailMiscCeiling = 1024; -const bool kIs64BitSystem = sizeof(void*) == 8; +const bool kIs64BitSystem = (sizeof(void*) == 8); constexpr int kKeywordModelTensorArenaSize = 22 * 1024; uint8_t keyword_model_tensor_arena[kKeywordModelTensorArenaSize]; diff --git a/tensorflow/lite/micro/micro_mutable_op_resolver.h b/tensorflow/lite/micro/micro_mutable_op_resolver.h index 1b76f440a61..8c99f77729d 100644 --- a/tensorflow/lite/micro/micro_mutable_op_resolver.h +++ b/tensorflow/lite/micro/micro_mutable_op_resolver.h @@ -108,31 +108,23 @@ class MicroMutableOpResolver : public MicroOpResolver { // MicroMutableOpResolver object. TfLiteStatus AddAbs() { - // TODO(b/149408647): Replace ParseOpData with the operator specific parse - // function. return AddBuiltin(BuiltinOperator_ABS, *tflite::ops::micro::Register_ABS(), - ParseOpData); + ParseAbs); } TfLiteStatus AddAdd() { - // TODO(b/149408647): Replace ParseOpData with the operator specific parse - // function. return AddBuiltin(BuiltinOperator_ADD, *tflite::ops::micro::Register_ADD(), - ParseOpData); + ParseAdd); } TfLiteStatus AddArgMax() { - // TODO(b/149408647): Replace ParseOpData with the operator specific parse - // function. return AddBuiltin(BuiltinOperator_ARG_MAX, - *tflite::ops::micro::Register_ARG_MAX(), ParseOpData); + *tflite::ops::micro::Register_ARG_MAX(), ParseArgMax); } TfLiteStatus AddArgMin() { - // TODO(b/149408647): Replace ParseOpData with the operator specific parse - // function. 
return AddBuiltin(BuiltinOperator_ARG_MIN, - *tflite::ops::micro::Register_ARG_MIN(), ParseOpData); + *tflite::ops::micro::Register_ARG_MIN(), ParseArgMin); } TfLiteStatus AddAveragePool2D() { diff --git a/tensorflow/lite/micro/recording_micro_allocator.cc b/tensorflow/lite/micro/recording_micro_allocator.cc index 05ccdbdbfaa..e667e7db9a9 100644 --- a/tensorflow/lite/micro/recording_micro_allocator.cc +++ b/tensorflow/lite/micro/recording_micro_allocator.cc @@ -110,37 +110,54 @@ void RecordingMicroAllocator::PrintRecordedAllocation( TfLiteStatus RecordingMicroAllocator::AllocateTfLiteTensorArray( TfLiteContext* context, const SubGraph* subgraph) { - SnapshotAllocationUsage(recorded_tflite_tensor_array_data_); + RecordedAllocation allocations = SnapshotAllocationUsage(); TfLiteStatus status = MicroAllocator::AllocateTfLiteTensorArray(context, subgraph); - RecordAllocationUsage(recorded_tflite_tensor_array_data_); - recorded_tflite_tensor_array_data_.count = context->tensors_size; + RecordAllocationUsage(allocations, recorded_tflite_tensor_array_data_); + // The allocation for this recording will always be 1. This is because the + // parent class mallocs one large allocation for the number of tensors in the + // graph (e.g. sizeof(TfLiteTensor) * num_tensors). + // To prevent extra overhead and potential for fragmentation, manually adjust + // the accounting by decrementing by 1 and adding the actual number of tensors + // used in the graph: + recorded_tflite_tensor_array_data_.count += context->tensors_size - 1; return status; } TfLiteStatus RecordingMicroAllocator::PopulateTfLiteTensorArrayFromFlatbuffer( const Model* model, TfLiteContext* context, const SubGraph* subgraph) { - SnapshotAllocationUsage(recorded_tflite_tensor_array_quantization_data_); + RecordedAllocation allocations = SnapshotAllocationUsage(); TfLiteStatus status = MicroAllocator::PopulateTfLiteTensorArrayFromFlatbuffer( model, context, subgraph); - RecordAllocationUsage(recorded_tflite_tensor_array_quantization_data_); + RecordAllocationUsage(allocations, + recorded_tflite_tensor_array_quantization_data_); return status; } TfLiteStatus RecordingMicroAllocator::AllocateNodeAndRegistrations( const SubGraph* subgraph, NodeAndRegistration** node_and_registrations) { - SnapshotAllocationUsage(recorded_node_and_registration_array_data_); + RecordedAllocation allocations = SnapshotAllocationUsage(); TfLiteStatus status = MicroAllocator::AllocateNodeAndRegistrations( subgraph, node_and_registrations); - RecordAllocationUsage(recorded_node_and_registration_array_data_); - recorded_node_and_registration_array_data_.count = - subgraph->operators()->size(); + RecordAllocationUsage(allocations, + recorded_node_and_registration_array_data_); + // The allocation count in SimpleMemoryAllocator will only be 1. To provide + // better logging, decrement by 1 and add in the actual number of operators + // used in the graph: + // The allocation for this recording will always be 1. This is because the + // parent class mallocs one large allocation for the number of nodes in the + // graph (e.g. sizeof(NodeAndRegistration) * num_nodes). 
+ // To prevent extra overhead and potential for fragmentation, manually adjust + // the accounting by decrementing by 1 and adding the actual number of nodes + // used in the graph: + recorded_node_and_registration_array_data_.count += + subgraph->operators()->size() - 1; return status; } @@ -149,43 +166,45 @@ RecordingMicroAllocator::PrepareNodeAndRegistrationDataFromFlatbuffer( const Model* model, const SubGraph* subgraph, const MicroOpResolver& op_resolver, NodeAndRegistration* node_and_registrations) { - SnapshotAllocationUsage(recorded_op_data_); + RecordedAllocation allocations = SnapshotAllocationUsage(); TfLiteStatus status = MicroAllocator::PrepareNodeAndRegistrationDataFromFlatbuffer( model, subgraph, op_resolver, node_and_registrations); - RecordAllocationUsage(recorded_op_data_); + RecordAllocationUsage(allocations, recorded_op_data_); return status; } TfLiteStatus RecordingMicroAllocator::AllocateVariables( TfLiteContext* context, const SubGraph* subgraph) { - SnapshotAllocationUsage(recorded_tflite_tensor_variable_buffer_data_); + RecordedAllocation allocations = SnapshotAllocationUsage(); TfLiteStatus status = MicroAllocator::AllocateVariables(context, subgraph); - RecordAllocationUsage(recorded_tflite_tensor_variable_buffer_data_); + RecordAllocationUsage(allocations, + recorded_tflite_tensor_variable_buffer_data_); return status; } -void RecordingMicroAllocator::SnapshotAllocationUsage( - RecordedAllocation& recorded_allocation) { - recorded_allocation.requested_bytes = - recording_memory_allocator_->GetRequestedBytes(); - recorded_allocation.used_bytes = recording_memory_allocator_->GetUsedBytes(); - recorded_allocation.count = recording_memory_allocator_->GetAllocatedCount(); +RecordedAllocation RecordingMicroAllocator::SnapshotAllocationUsage() const { + return {/*requested_bytes=*/recording_memory_allocator_->GetRequestedBytes(), + /*used_bytes=*/recording_memory_allocator_->GetUsedBytes(), + /*count=*/recording_memory_allocator_->GetAllocatedCount()}; } void RecordingMicroAllocator::RecordAllocationUsage( + const RecordedAllocation& snapshotted_allocation, RecordedAllocation& recorded_allocation) { - recorded_allocation.requested_bytes = + recorded_allocation.requested_bytes += recording_memory_allocator_->GetRequestedBytes() - - recorded_allocation.requested_bytes; - recorded_allocation.used_bytes = recording_memory_allocator_->GetUsedBytes() - - recorded_allocation.used_bytes; - recorded_allocation.count = recording_memory_allocator_->GetAllocatedCount() - - recorded_allocation.count; + snapshotted_allocation.requested_bytes; + recorded_allocation.used_bytes += + recording_memory_allocator_->GetUsedBytes() - + snapshotted_allocation.used_bytes; + recorded_allocation.count += + recording_memory_allocator_->GetAllocatedCount() - + snapshotted_allocation.count; } } // namespace tflite diff --git a/tensorflow/lite/micro/recording_micro_allocator.h b/tensorflow/lite/micro/recording_micro_allocator.h index b30b045cc34..a5b97c7ef3a 100644 --- a/tensorflow/lite/micro/recording_micro_allocator.h +++ b/tensorflow/lite/micro/recording_micro_allocator.h @@ -36,12 +36,11 @@ enum class RecordedAllocationType { // type. Each recording contains the number of bytes requested, the actual bytes // allocated (can defer from requested by alignment), and the number of items // allocated. 
-typedef struct RecordedAllocation { - RecordedAllocation() : requested_bytes(0), used_bytes(0), count(0) {} +struct RecordedAllocation { size_t requested_bytes; size_t used_bytes; size_t count; -} RecordedAllocation; +}; // Utility subclass of MicroAllocator that records all allocations // inside the arena. A summary of allocations can be logged through the @@ -82,9 +81,6 @@ class RecordingMicroAllocator : public MicroAllocator { TfLiteStatus AllocateVariables(TfLiteContext* context, const SubGraph* subgraph) override; - void SnapshotAllocationUsage(RecordedAllocation& recorded_allocation); - void RecordAllocationUsage(RecordedAllocation& recorded_allocation); - private: RecordingMicroAllocator(RecordingSimpleMemoryAllocator* memory_allocator, ErrorReporter* error_reporter); @@ -93,13 +89,17 @@ class RecordingMicroAllocator : public MicroAllocator { const char* allocation_name, const char* allocation_description) const; + RecordedAllocation SnapshotAllocationUsage() const; + void RecordAllocationUsage(const RecordedAllocation& snapshotted_allocation, + RecordedAllocation& recorded_allocation); + const RecordingSimpleMemoryAllocator* recording_memory_allocator_; - RecordedAllocation recorded_tflite_tensor_array_data_; - RecordedAllocation recorded_tflite_tensor_array_quantization_data_; - RecordedAllocation recorded_tflite_tensor_variable_buffer_data_; - RecordedAllocation recorded_node_and_registration_array_data_; - RecordedAllocation recorded_op_data_; + RecordedAllocation recorded_tflite_tensor_array_data_ = {}; + RecordedAllocation recorded_tflite_tensor_array_quantization_data_ = {}; + RecordedAllocation recorded_tflite_tensor_variable_buffer_data_ = {}; + RecordedAllocation recorded_node_and_registration_array_data_ = {}; + RecordedAllocation recorded_op_data_ = {}; TF_LITE_REMOVE_VIRTUAL_DELETE }; diff --git a/tensorflow/lite/micro/recording_micro_allocator_test.cc b/tensorflow/lite/micro/recording_micro_allocator_test.cc index 775a2de2dfd..8b8eaa20638 100644 --- a/tensorflow/lite/micro/recording_micro_allocator_test.cc +++ b/tensorflow/lite/micro/recording_micro_allocator_test.cc @@ -43,12 +43,20 @@ TF_LITE_MICRO_TEST(TestRecordsTfLiteTensorArrayData) { tflite::RecordingMicroAllocator* micro_allocator = tflite::RecordingMicroAllocator::Create(arena, kTestConvArenaSize, micro_test::reporter); - TF_LITE_MICRO_EXPECT_NE(nullptr, micro_allocator); - TF_LITE_MICRO_EXPECT_GE(kTfLiteOk, micro_allocator->StartModelAllocation( - model, &context, all_ops_resolver, - &node_and_registration)); - TF_LITE_MICRO_EXPECT_GE( - kTfLiteOk, micro_allocator->FinishModelAllocation(model, &context)); + // TODO(b/158102673): ugly workaround for not having fatal assertions. Same + // throughout this file. 
+ TF_LITE_MICRO_EXPECT_NE(micro_allocator, nullptr); + if (micro_allocator == nullptr) return 1; + + TfLiteStatus status; + status = micro_allocator->StartModelAllocation( + model, &context, all_ops_resolver, &node_and_registration); + TF_LITE_MICRO_EXPECT_EQ(status, kTfLiteOk); + if (status != kTfLiteOk) return 1; + + status = micro_allocator->FinishModelAllocation(model, &context); + TF_LITE_MICRO_EXPECT_EQ(status, kTfLiteOk); + if (status != kTfLiteOk) return 1; tflite::RecordedAllocation recorded_allocation = micro_allocator->GetRecordedAllocation( @@ -70,12 +78,18 @@ TF_LITE_MICRO_TEST(TestRecordsTensorArrayQuantizationData) { tflite::RecordingMicroAllocator* micro_allocator = tflite::RecordingMicroAllocator::Create(arena, kTestConvArenaSize, micro_test::reporter); - TF_LITE_MICRO_EXPECT_NE(nullptr, micro_allocator); - TF_LITE_MICRO_EXPECT_GE(kTfLiteOk, micro_allocator->StartModelAllocation( - model, &context, all_ops_resolver, - &node_and_registration)); - TF_LITE_MICRO_EXPECT_GE( - kTfLiteOk, micro_allocator->FinishModelAllocation(model, &context)); + TF_LITE_MICRO_EXPECT_NE(micro_allocator, nullptr); + if (micro_allocator == nullptr) return 1; + + TfLiteStatus status; + status = micro_allocator->StartModelAllocation( + model, &context, all_ops_resolver, &node_and_registration); + TF_LITE_MICRO_EXPECT_EQ(status, kTfLiteOk); + if (status != kTfLiteOk) return 1; + + status = micro_allocator->FinishModelAllocation(model, &context); + TF_LITE_MICRO_EXPECT_EQ(status, kTfLiteOk); + if (status != kTfLiteOk) return 1; // Walk the model subgraph to find all tensors with quantization params and // keep a tally. @@ -124,12 +138,18 @@ TF_LITE_MICRO_TEST(TestRecordsNodeAndRegistrationArrayData) { tflite::RecordingMicroAllocator* micro_allocator = tflite::RecordingMicroAllocator::Create(arena, kTestConvArenaSize, micro_test::reporter); - TF_LITE_MICRO_EXPECT_NE(nullptr, micro_allocator); - TF_LITE_MICRO_EXPECT_GE(kTfLiteOk, micro_allocator->StartModelAllocation( - model, &context, all_ops_resolver, - &node_and_registration)); - TF_LITE_MICRO_EXPECT_GE( - kTfLiteOk, micro_allocator->FinishModelAllocation(model, &context)); + TF_LITE_MICRO_EXPECT_NE(micro_allocator, nullptr); + if (micro_allocator == nullptr) return 1; + + TfLiteStatus status; + status = micro_allocator->StartModelAllocation( + model, &context, all_ops_resolver, &node_and_registration); + TF_LITE_MICRO_EXPECT_EQ(status, kTfLiteOk); + if (status != kTfLiteOk) return 1; + + status = micro_allocator->FinishModelAllocation(model, &context); + TF_LITE_MICRO_EXPECT_EQ(status, kTfLiteOk); + if (status != kTfLiteOk) return 1; size_t num_ops = model->subgraphs()->Get(0)->operators()->size(); tflite::RecordedAllocation recorded_allocation = @@ -142,6 +162,55 @@ TF_LITE_MICRO_TEST(TestRecordsNodeAndRegistrationArrayData) { num_ops * NODE_AND_REGISTRATION_STRUCT_SIZE); } +TF_LITE_MICRO_TEST(TestRecordsMultiTenantAllocations) { + TfLiteContext context; + tflite::AllOpsResolver all_ops_resolver; + tflite::NodeAndRegistration* node_and_registration; + const tflite::Model* model = tflite::GetModel(kTestConvModelData); + + // Double the arena size to allocate two models inside of it: + uint8_t arena[kTestConvArenaSize * 2]; + + TfLiteStatus status; + + tflite::RecordingMicroAllocator* micro_allocator = + tflite::RecordingMicroAllocator::Create(arena, kTestConvArenaSize * 2, + micro_test::reporter); + TF_LITE_MICRO_EXPECT_NE(micro_allocator, nullptr); + if (micro_allocator == nullptr) return 1; + + // First allocation with the model in the arena: + 
status = micro_allocator->StartModelAllocation( + model, &context, all_ops_resolver, &node_and_registration); + TF_LITE_MICRO_EXPECT_EQ(status, kTfLiteOk); + if (status != kTfLiteOk) return 1; + + status = micro_allocator->FinishModelAllocation(model, &context); + TF_LITE_MICRO_EXPECT_EQ(status, kTfLiteOk); + if (status != kTfLiteOk) return 1; + + // Second allocation with the same model in the arena: + status = micro_allocator->StartModelAllocation( + model, &context, all_ops_resolver, &node_and_registration); + TF_LITE_MICRO_EXPECT_EQ(status, kTfLiteOk); + if (status != kTfLiteOk) return 1; + + status = micro_allocator->FinishModelAllocation(model, &context); + TF_LITE_MICRO_EXPECT_EQ(status, kTfLiteOk); + if (status != kTfLiteOk) return 1; + + tflite::RecordedAllocation recorded_allocation = + micro_allocator->GetRecordedAllocation( + tflite::RecordedAllocationType::kTfLiteTensorArray); + TF_LITE_MICRO_EXPECT_EQ(recorded_allocation.count, context.tensors_size * 2); + TF_LITE_MICRO_EXPECT_EQ( + recorded_allocation.requested_bytes, + context.tensors_size * TF_LITE_TENSOR_STRUCT_SIZE * 2); + TF_LITE_MICRO_EXPECT_GE( + recorded_allocation.used_bytes, + context.tensors_size * TF_LITE_TENSOR_STRUCT_SIZE * 2); +} + + // TODO(b/158124094): Find a way to audit OpData allocations on // cross-architectures. diff --git a/tensorflow/lite/micro/recording_micro_interpreter.h b/tensorflow/lite/micro/recording_micro_interpreter.h index eb443fc6fd1..0a579b0be8e 100644 --- a/tensorflow/lite/micro/recording_micro_interpreter.h +++ b/tensorflow/lite/micro/recording_micro_interpreter.h @@ -45,6 +45,13 @@ class RecordingMicroInterpreter : public MicroInterpreter { recording_micro_allocator_( static_cast(allocator())) {} + RecordingMicroInterpreter(const Model* model, + const MicroOpResolver& op_resolver, + RecordingMicroAllocator* allocator, + ErrorReporter* error_reporter) + : MicroInterpreter(model, op_resolver, allocator, error_reporter), + recording_micro_allocator_(*allocator) {} + const RecordingMicroAllocator& GetMicroAllocator() const { return recording_micro_allocator_; } diff --git a/tensorflow/lite/micro/stm32f4HAL/debug_log.cc b/tensorflow/lite/micro/stm32f4HAL/debug_log.cc index 4be3b40e782..6e1936af8fb 100644 --- a/tensorflow/lite/micro/stm32f4HAL/debug_log.cc +++ b/tensorflow/lite/micro/stm32f4HAL/debug_log.cc @@ -22,6 +22,10 @@ limitations under the License.
extern UART_HandleTypeDef DEBUG_UART_HANDLE; +#ifdef __cplusplus +extern "C" { +#endif + #ifdef __GNUC__ int __io_putchar(int ch) { HAL_UART_Transmit(&DEBUG_UART_HANDLE, (uint8_t *)&ch, 1, HAL_MAX_DELAY); @@ -36,4 +40,8 @@ int fputc(int ch, FILE *f) { } #endif /* __GNUC__ */ -extern "C" void DebugLog(const char *s) { fprintf(stderr, "%s", s); } +void DebugLog(const char *s) { fprintf(stderr, "%s", s); } + +#ifdef __cplusplus +} +#endif diff --git a/tensorflow/lite/tools/evaluation/evaluation_delegate_provider.cc b/tensorflow/lite/tools/evaluation/evaluation_delegate_provider.cc index 42f2666ba9b..fc40440b105 100644 --- a/tensorflow/lite/tools/evaluation/evaluation_delegate_provider.cc +++ b/tensorflow/lite/tools/evaluation/evaluation_delegate_provider.cc @@ -97,7 +97,13 @@ bool DelegateProviders::InitFromCmdlineArgs(int* argc, const char** argv) { auto one_flags = one->CreateFlags(¶ms_); flags.insert(flags.end(), one_flags.begin(), one_flags.end()); } - return Flags::Parse(argc, argv, flags); + + const bool parse_result = Flags::Parse(argc, argv, flags); + if (!parse_result) { + std::string usage = Flags::Usage(argv[0], flags); + TFLITE_LOG(ERROR) << usage; + } + return parse_result; } TfLiteDelegatePtr DelegateProviders::CreateDelegate( diff --git a/tensorflow/lite/tools/evaluation/tasks/BUILD b/tensorflow/lite/tools/evaluation/tasks/BUILD index d8daf170331..5272542f045 100644 --- a/tensorflow/lite/tools/evaluation/tasks/BUILD +++ b/tensorflow/lite/tools/evaluation/tasks/BUILD @@ -10,10 +10,14 @@ package( cc_library( name = "task_executor", + srcs = ["task_executor.cc"], hdrs = ["task_executor.h"], copts = tflite_copts(), linkopts = task_linkopts(), deps = [ + "//tensorflow/lite/tools:command_line_flags", + "//tensorflow/lite/tools:logging", + "//tensorflow/lite/tools/evaluation:evaluation_delegate_provider", "//tensorflow/lite/tools/evaluation/proto:evaluation_config_cc_proto", "@com_google_absl//absl/types:optional", ], diff --git a/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/BUILD b/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/BUILD index b8f77d72acb..dc5f8237f6a 100644 --- a/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/BUILD +++ b/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/BUILD @@ -26,7 +26,6 @@ cc_library( "//tensorflow/lite/c:common", "//tensorflow/lite/tools:command_line_flags", "//tensorflow/lite/tools:logging", - "//tensorflow/lite/tools/evaluation:evaluation_delegate_provider", "//tensorflow/lite/tools/evaluation:evaluation_stage", "//tensorflow/lite/tools/evaluation:utils", "//tensorflow/lite/tools/evaluation/proto:evaluation_config_cc_proto", diff --git a/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/run_eval.cc b/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/run_eval.cc index 765e8fc6465..73491457f38 100644 --- a/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/run_eval.cc +++ b/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/run_eval.cc @@ -21,7 +21,6 @@ limitations under the License. 
#include "absl/types/optional.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/tools/command_line_flags.h" -#include "tensorflow/lite/tools/evaluation/evaluation_delegate_provider.h" #include "tensorflow/lite/tools/evaluation/proto/evaluation_config.pb.h" #include "tensorflow/lite/tools/evaluation/proto/evaluation_stages.pb.h" #include "tensorflow/lite/tools/evaluation/stages/object_detection_stage.h" @@ -49,11 +48,14 @@ std::string GetNameFromPath(const std::string& str) { class CocoObjectDetection : public TaskExecutor { public: - CocoObjectDetection(int* argc, char* argv[]); + CocoObjectDetection() : debug_mode_(false), num_interpreter_threads_(1) {} ~CocoObjectDetection() override {} + protected: + std::vector GetFlags() final; + // If the run is successful, the latest metrics will be returned. - absl::optional Run() final; + absl::optional RunImpl() final; private: void OutputResult(const EvaluationStageMetrics& latest_metrics) const; @@ -68,8 +70,7 @@ class CocoObjectDetection : public TaskExecutor { DelegateProviders delegate_providers_; }; -CocoObjectDetection::CocoObjectDetection(int* argc, char* argv[]) - : debug_mode_(false), num_interpreter_threads_(1) { +std::vector CocoObjectDetection::GetFlags() { std::vector flag_list = { tflite::Flag::CreateFlag(kModelFileFlag, &model_file_path_, "Path to test tflite model file."), @@ -105,12 +106,10 @@ CocoObjectDetection::CocoObjectDetection(int* argc, char* argv[]) "Delegate to use for inference, if available. " "Must be one of {'nnapi', 'gpu', 'xnnpack', 'hexagon'}"), }; - tflite::Flags::Parse(argc, const_cast(argv), flag_list); - DelegateProviders delegate_providers; - delegate_providers.InitFromCmdlineArgs(argc, const_cast(argv)); + return flag_list; } -absl::optional CocoObjectDetection::Run() { +absl::optional CocoObjectDetection::RunImpl() { // Process images in filename-sorted order. std::vector image_paths; if (GetSortedFileNames(StripTrailingSlashes(ground_truth_images_path_), @@ -224,8 +223,8 @@ void CocoObjectDetection::OutputResult( << precision_metrics.overall_mean_average_precision(); } -std::unique_ptr CreateTaskExecutor(int* argc, char* argv[]) { - return std::unique_ptr(new CocoObjectDetection(argc, argv)); +std::unique_ptr CreateTaskExecutor() { + return std::unique_ptr(new CocoObjectDetection()); } } // namespace evaluation diff --git a/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/BUILD b/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/BUILD index de2a7f96311..941bbc0ff69 100644 --- a/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/BUILD +++ b/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/BUILD @@ -17,7 +17,6 @@ cc_library( "//tensorflow/lite/c:common", "//tensorflow/lite/tools:command_line_flags", "//tensorflow/lite/tools:logging", - "//tensorflow/lite/tools/evaluation:evaluation_delegate_provider", "//tensorflow/lite/tools/evaluation:evaluation_stage", "//tensorflow/lite/tools/evaluation:utils", "//tensorflow/lite/tools/evaluation/proto:evaluation_config_cc_proto", diff --git a/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/run_eval.cc b/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/run_eval.cc index 13eeb313ad4..fdc97d44abc 100644 --- a/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/run_eval.cc +++ b/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/run_eval.cc @@ -20,7 +20,6 @@ limitations under the License. 
#include "absl/types/optional.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/tools/command_line_flags.h" -#include "tensorflow/lite/tools/evaluation/evaluation_delegate_provider.h" #include "tensorflow/lite/tools/evaluation/proto/evaluation_config.pb.h" #include "tensorflow/lite/tools/evaluation/proto/evaluation_stages.pb.h" #include "tensorflow/lite/tools/evaluation/stages/image_classification_stage.h" @@ -50,11 +49,14 @@ std::vector GetFirstN(const std::vector& v, int n) { class ImagenetClassification : public TaskExecutor { public: - ImagenetClassification(int* argc, char* argv[]); + ImagenetClassification() : num_images_(0), num_interpreter_threads_(1) {} ~ImagenetClassification() override {} + protected: + std::vector GetFlags() final; + // If the run is successful, the latest metrics will be returned. - absl::optional Run() final; + absl::optional RunImpl() final; private: void OutputResult(const EvaluationStageMetrics& latest_metrics) const; @@ -67,11 +69,9 @@ class ImagenetClassification : public TaskExecutor { std::string delegate_; int num_images_; int num_interpreter_threads_; - DelegateProviders delegate_providers_; }; -ImagenetClassification::ImagenetClassification(int* argc, char* argv[]) - : num_images_(0), num_interpreter_threads_(1) { +std::vector ImagenetClassification::GetFlags() { std::vector flag_list = { tflite::Flag::CreateFlag(kModelFileFlag, &model_file_path_, "Path to test tflite model file."), @@ -107,11 +107,10 @@ ImagenetClassification::ImagenetClassification(int* argc, char* argv[]) "Delegate to use for inference, if available. " "Must be one of {'nnapi', 'gpu', 'hexagon', 'xnnpack'}"), }; - tflite::Flags::Parse(argc, const_cast(argv), flag_list); - delegate_providers_.InitFromCmdlineArgs(argc, const_cast(argv)); + return flag_list; } -absl::optional ImagenetClassification::Run() { +absl::optional ImagenetClassification::RunImpl() { // Process images in filename-sorted order. std::vector image_files, ground_truth_image_labels; if (GetSortedFileNames(StripTrailingSlashes(ground_truth_images_path_), @@ -203,8 +202,8 @@ void ImagenetClassification::OutputResult( } } -std::unique_ptr CreateTaskExecutor(int* argc, char* argv[]) { - return std::unique_ptr(new ImagenetClassification(argc, argv)); +std::unique_ptr CreateTaskExecutor() { + return std::unique_ptr(new ImagenetClassification()); } } // namespace evaluation diff --git a/tensorflow/lite/tools/evaluation/tasks/inference_diff/BUILD b/tensorflow/lite/tools/evaluation/tasks/inference_diff/BUILD index a53872b50cb..36606722caf 100644 --- a/tensorflow/lite/tools/evaluation/tasks/inference_diff/BUILD +++ b/tensorflow/lite/tools/evaluation/tasks/inference_diff/BUILD @@ -17,7 +17,6 @@ cc_library( "//tensorflow/lite/c:common", "//tensorflow/lite/tools:command_line_flags", "//tensorflow/lite/tools:logging", - "//tensorflow/lite/tools/evaluation:evaluation_delegate_provider", "//tensorflow/lite/tools/evaluation:evaluation_stage", "//tensorflow/lite/tools/evaluation/proto:evaluation_config_cc_proto", "//tensorflow/lite/tools/evaluation/proto:evaluation_stages_cc_proto", diff --git a/tensorflow/lite/tools/evaluation/tasks/inference_diff/run_eval.cc b/tensorflow/lite/tools/evaluation/tasks/inference_diff/run_eval.cc index 814ebe3b3bf..9a3fea0b8a3 100644 --- a/tensorflow/lite/tools/evaluation/tasks/inference_diff/run_eval.cc +++ b/tensorflow/lite/tools/evaluation/tasks/inference_diff/run_eval.cc @@ -19,7 +19,6 @@ limitations under the License. 
#include "absl/types/optional.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/tools/command_line_flags.h" -#include "tensorflow/lite/tools/evaluation/evaluation_delegate_provider.h" #include "tensorflow/lite/tools/evaluation/proto/evaluation_config.pb.h" #include "tensorflow/lite/tools/evaluation/proto/evaluation_stages.pb.h" #include "tensorflow/lite/tools/evaluation/stages/inference_profiler_stage.h" @@ -37,11 +36,14 @@ constexpr char kDelegateFlag[] = "delegate"; class InferenceDiff : public TaskExecutor { public: - InferenceDiff(int* argc, char* argv[]); + InferenceDiff() : num_runs_(50), num_interpreter_threads_(1) {} ~InferenceDiff() override {} + protected: + std::vector GetFlags() final; + // If the run is successful, the latest metrics will be returned. - absl::optional Run() final; + absl::optional RunImpl() final; private: void OutputResult(const EvaluationStageMetrics& latest_metrics) const; @@ -50,11 +52,9 @@ class InferenceDiff : public TaskExecutor { std::string delegate_; int num_runs_; int num_interpreter_threads_; - DelegateProviders delegate_providers_; }; -InferenceDiff::InferenceDiff(int* argc, char* argv[]) - : num_runs_(50), num_interpreter_threads_(1) { +std::vector InferenceDiff::GetFlags() { // Command Line Flags. std::vector flag_list = { tflite::Flag::CreateFlag(kModelFileFlag, &model_file_path_, @@ -72,11 +72,11 @@ InferenceDiff::InferenceDiff(int* argc, char* argv[]) "Delegate to use for test inference, if available. " "Must be one of {'nnapi', 'gpu', 'hexagon', 'xnnpack'}"), }; - tflite::Flags::Parse(argc, const_cast(argv), flag_list); - delegate_providers_.InitFromCmdlineArgs(argc, const_cast(argv)); + + return flag_list; } -absl::optional InferenceDiff::Run() { +absl::optional InferenceDiff::RunImpl() { // Initialize evaluation stage. EvaluationStageConfig eval_config; eval_config.set_name("inference_profiling"); @@ -137,8 +137,8 @@ void InferenceDiff::OutputResult( } } -std::unique_ptr CreateTaskExecutor(int* argc, char* argv[]) { - return std::unique_ptr(new InferenceDiff(argc, argv)); +std::unique_ptr CreateTaskExecutor() { + return std::unique_ptr(new InferenceDiff()); } } // namespace evaluation diff --git a/tensorflow/lite/tools/evaluation/tasks/task_executor.cc b/tensorflow/lite/tools/evaluation/tasks/task_executor.cc new file mode 100644 index 00000000000..e62793dc6ff --- /dev/null +++ b/tensorflow/lite/tools/evaluation/tasks/task_executor.cc @@ -0,0 +1,47 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/lite/tools/evaluation/tasks/task_executor.h" + +#include "absl/types/optional.h" +#include "tensorflow/lite/tools/logging.h" + +namespace tflite { +namespace evaluation { +absl::optional<EvaluationStageMetrics> TaskExecutor::Run(int* argc, + char* argv[]) { + auto flag_list = GetFlags(); + bool parse_result = + tflite::Flags::Parse(argc, const_cast<const char**>(argv), flag_list); + if (!parse_result) { + std::string usage = Flags::Usage(argv[0], flag_list); + TFLITE_LOG(ERROR) << usage; + return absl::nullopt; + } + parse_result = delegate_providers_.InitFromCmdlineArgs( + argc, const_cast<const char**>(argv)); + if (!parse_result) { + return absl::nullopt; + } + + std::string unconsumed_args = + Flags::ArgsToString(*argc, const_cast<const char**>(argv)); + if (!unconsumed_args.empty()) { + TFLITE_LOG(WARN) << "Unconsumed cmdline flags: " << unconsumed_args; + } + + return RunImpl(); +} +} // namespace evaluation +} // namespace tflite diff --git a/tensorflow/lite/tools/evaluation/tasks/task_executor.h b/tensorflow/lite/tools/evaluation/tasks/task_executor.h index b50e7d6d03f..caa84283098 100644 --- a/tensorflow/lite/tools/evaluation/tasks/task_executor.h +++ b/tensorflow/lite/tools/evaluation/tasks/task_executor.h @@ -16,6 +16,8 @@ limitations under the License. #define TENSORFLOW_LITE_TOOLS_EVALUATION_TASKS_TASK_EXECUTOR_H_ #include "absl/types/optional.h" +#include "tensorflow/lite/tools/command_line_flags.h" +#include "tensorflow/lite/tools/evaluation/evaluation_delegate_provider.h" #include "tensorflow/lite/tools/evaluation/proto/evaluation_config.pb.h" namespace tflite { @@ -25,13 +27,22 @@ namespace evaluation { class TaskExecutor { public: virtual ~TaskExecutor() {} + // If the run is successful, the latest metrics will be returned. - virtual absl::optional<EvaluationStageMetrics> Run() = 0; + absl::optional<EvaluationStageMetrics> Run(int* argc, char* argv[]); + + protected: + // Returns a list of commandline flags that this task defines. + virtual std::vector<Flag> GetFlags() = 0; + + virtual absl::optional<EvaluationStageMetrics> RunImpl() = 0; + + DelegateProviders delegate_providers_; }; // Just a declaration. In order to avoid the boilerplate main-function code, // every evaluation task should define this function. -std::unique_ptr<TaskExecutor> CreateTaskExecutor(int* argc, char* argv[]); +std::unique_ptr<TaskExecutor> CreateTaskExecutor(); } // namespace evaluation } // namespace tflite diff --git a/tensorflow/lite/tools/evaluation/tasks/task_executor_main.cc b/tensorflow/lite/tools/evaluation/tasks/task_executor_main.cc index 6ef1a6724b7..97f8e263659 100644 --- a/tensorflow/lite/tools/evaluation/tasks/task_executor_main.cc +++ b/tensorflow/lite/tools/evaluation/tasks/task_executor_main.cc @@ -18,12 +18,12 @@ limitations under the License. // This could serve as the main function for all eval tools.
int main(int argc, char* argv[]) { - auto task_executor = tflite::evaluation::CreateTaskExecutor(&argc, argv); + auto task_executor = tflite::evaluation::CreateTaskExecutor(); if (task_executor == nullptr) { TFLITE_LOG(ERROR) << "Could not create the task evaluation!"; return EXIT_FAILURE; } - const auto metrics = task_executor->Run(); + const auto metrics = task_executor->Run(&argc, argv); if (!metrics.has_value()) { TFLITE_LOG(ERROR) << "Could not run the task evaluation!"; return EXIT_FAILURE; diff --git a/tensorflow/lite/tools/optimize/calibration/builtin_logging_ops/lstm.cc b/tensorflow/lite/tools/optimize/calibration/builtin_logging_ops/lstm.cc index 09ce81c1d97..ed1ef07d8d3 100644 --- a/tensorflow/lite/tools/optimize/calibration/builtin_logging_ops/lstm.cc +++ b/tensorflow/lite/tools/optimize/calibration/builtin_logging_ops/lstm.cc @@ -222,8 +222,8 @@ inline void LstmStepWithAuxInput( cell_state_ptr); } if (params->cell_clip > 0.0) { - tensor_utils::ClipVector(cell_state_ptr, n_batch * n_cell, - params->cell_clip, cell_state_ptr); + tensor_utils::CwiseClipping(cell_state_ptr, n_batch * n_cell, + params->cell_clip); } // For each batch and cell: update the output gate. @@ -268,8 +268,8 @@ inline void LstmStepWithAuxInput( projection_weights_ptr, n_output, n_cell, output_gate_scratch, n_batch, output_state_ptr); if (params->proj_clip > 0.0) { - tensor_utils::ClipVector(output_state_ptr, n_batch * n_output, - params->proj_clip, output_state_ptr); + tensor_utils::CwiseClipping(output_state_ptr, n_batch * n_output, + params->proj_clip); } } else { std::copy_n(output_gate_scratch, n_batch * n_output, output_state_ptr); diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index 85ee8de5635..d2c46f64f18 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -751,6 +751,7 @@ py_library( "//tensorflow/python:variable_scope", "//tensorflow/python:variables", "//tensorflow/python/eager:context", + "//tensorflow/python/saved_model:save_context", "//tensorflow/python/training/saving:saveable_object", "//tensorflow/python/training/saving:saveable_object_util", "//tensorflow/python/training/tracking:base", @@ -1171,6 +1172,7 @@ distribute_py_test( "//tensorflow/python/eager:context", "//tensorflow/python/eager:def_function", "//tensorflow/python/eager:test", + "//tensorflow/python/saved_model:save_context", "//tensorflow/python/saved_model/model_utils:mode_keys", "//tensorflow/python/tpu:tpu_lib", "//tensorflow/python/types", @@ -1274,28 +1276,6 @@ distribute_py_test( ], ) -distribute_py_test( - name = "custom_training_loop_models_test", - srcs = ["custom_training_loop_models_test.py"], - main = "custom_training_loop_models_test.py", - tags = [ - "multi_and_single_gpu", - ], - tpu_tags = [ - "no_oss", # b/153615544. 
- ], - deps = [ - ":combinations", - ":strategy_combinations", - "//tensorflow/python:errors", - "//tensorflow/python:variables", - "//tensorflow/python/data/ops:dataset_ops", - "//tensorflow/python/eager:test", - "//tensorflow/python/keras", - "@absl_py//absl/testing:parameterized", - ], -) - distribute_py_test( name = "custom_training_loop_optimizer_test", srcs = ["custom_training_loop_optimizer_test.py"], diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py index f32427b88e0..d7893ae54f8 100644 --- a/tensorflow/python/distribute/distribute_lib.py +++ b/tensorflow/python/distribute/distribute_lib.py @@ -1217,20 +1217,85 @@ class StrategyBase(object): return self.run(fn, args=args, kwargs=kwargs, options=options) def reduce(self, reduce_op, value, axis): - """Reduce `value` across replicas. + """Reduce `value` across replicas and return result on current device. + + >>> strategy = tf.distribute.MirroredStrategy() + >>> def step_fn(): + ... i = tf.distribute.get_replica_context().replica_id_in_sync_group + ... return tf.identity(i) + >>> + >>> per_replica_result = strategy.run(step_fn) + >>> total = strategy.reduce("SUM", per_replica_result, axis=None) + >>> total + + + To see how this would look with multiple replicas, consider the same + example with MirroredStrategy with 2 GPUs: + + ```python + strategy = tf.distribute.MirroredStrategy(devices=["gpu:0", "gpu:1"]) + def step_fn(): + i = tf.distribute.get_replica_context().replica_id_in_sync_group + return tf.identity(i) + + per_replica_result = strategy.run(step_fn) + # Check devices on which per replica result is: + strategy.experimental_local_results(per_replica_result)[0].device + # /job:localhost/replica:0/task:0/device:GPU:0 + strategy.experimental_local_results(per_replica_result)[1].device + # /job:localhost/replica:0/task:0/device:GPU:1 + + total = strategy.reduce("SUM", per_replica_result, axis=None) + # Check device on which reduced result is: + total.device + # /job:localhost/replica:0/task:0/device:CPU:0 + + ``` + + This API is typically used for aggregating the results returned from + different replicas, for reporting etc. For example, loss computed from + different replicas can be averaged using this API before printing. + + Note: The result is copied to the "current" device - which would typically + be the CPU of the worker on which the program is running. For `TPUStrategy`, + it is the first TPU host. For multi client `MultiWorkerMirroredStrategy`, + this is CPU of each worker. + + There are a number of different tf.distribute APIs for reducing values + across replicas: + * `tf.distribute.ReplicaContext.all_reduce`: This differs from + `Strategy.reduce` in that it is for replica context and does + not copy the results to the host device. `all_reduce` should be typically + used for reductions inside the training step such as gradients. + * `tf.distribute.StrategyExtended.reduce_to` and + `tf.distribute.StrategyExtended.batch_reduce_to`: These APIs are more + advanced versions of `Strategy.reduce` as they allow customizing the + destination of the result. They are also called in cross replica context. + + _What should axis be?_ Given a per-replica value returned by `run`, say a per-example loss, the batch will be divided across all the replicas. This function allows you to aggregate across replicas and optionally also across - batch elements. For example, if you have a global batch size of 8 and 2 + batch elements by specifying the axis parameter accordingly. 
+ + For example, if you have a global batch size of 8 and 2 replicas, values for examples `[0, 1, 2, 3]` will be on replica 0 and - `[4, 5, 6, 7]` will be on replica 1. By default, `reduce` will just - aggregate across replicas, returning `[0+4, 1+5, 2+6, 3+7]`. This is useful - when each replica is computing a scalar or some other value that doesn't - have a "batch" dimension (like a gradient). More often you will want to - aggregate across the global batch, which you can get by specifying the batch + `[4, 5, 6, 7]` will be on replica 1. With `axis=None`, `reduce` will + aggregate only across replicas, returning `[0+4, 1+5, 2+6, 3+7]`. + This is useful when each replica is computing a scalar or some other value + that doesn't have a "batch" dimension (like a gradient or loss). + ``` + strategy.reduce("sum", per_replica_result, axis=None) + ``` + + Sometimes, you will want to aggregate across both the global batch _and_ + all replicas. You can get this behavior by specifying the batch dimension as the `axis`, typically `axis=0`. In this case it would return a scalar `0+1+2+3+4+5+6+7`. + ``` + strategy.reduce("sum", per_replica_result, axis=0) + ``` If there is a last partial batch, you will need to specify an axis so that the resulting shape is consistent across replicas. So if the last @@ -1242,11 +1307,13 @@ class StrategyBase(object): which will weigh some values `1/8` and others `1/4`. Args: - reduce_op: A `tf.distribute.ReduceOp` value specifying how values should - be combined. - value: A "per replica" value, e.g. returned by `run` to - be combined into a single tensor. - axis: Specifies the dimension to reduce along within each + reduce_op: a `tf.distribute.ReduceOp` value specifying how values should + be combined. Allows using string representation of the enum such as + "SUM", "MEAN". + value: a `tf.distribute.DistributeValues` instance, e.g. returned by + `Strategy.run`, to be combined into a single tensor. It can also be a + regular tensor when used with `OneDeviceStrategy` or default strategy. + axis: specifies the dimension to reduce along within each replica's tensor. Should typically be set to the batch dimension, or `None` to only reduce across replicas (e.g. if the tensor has no batch dimension). diff --git a/tensorflow/python/distribute/input_lib.py b/tensorflow/python/distribute/input_lib.py index e4a362a92c6..64089e54bfa 100644 --- a/tensorflow/python/distribute/input_lib.py +++ b/tensorflow/python/distribute/input_lib.py @@ -1216,12 +1216,6 @@ class DistributedDatasetsFromFunction(_IterableInput): @property def element_spec(self): """The type specification of an element of this dataset.""" - if self._element_spec is None: - raise ValueError("You must create an iterator before calling " - "`element_spec` on the distributed dataset or iterator. " - "This is because the dataset function is not called " - "before an iterator is created.") - return self._element_spec diff --git a/tensorflow/python/distribute/multi_process_runner.py b/tensorflow/python/distribute/multi_process_runner.py index db31b9c4dd4..4971eea93ad 100644 --- a/tensorflow/python/distribute/multi_process_runner.py +++ b/tensorflow/python/distribute/multi_process_runner.py @@ -67,8 +67,7 @@ except ImportError: # exception stack trace info is stored in exc_info to pass on to parent process # to be re-raised. 
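To make the `Strategy.reduce` axis semantics documented above concrete, here is an editor's sketch (not part of the patch) using the default, single-replica strategy so it runs on any machine; the printed values follow directly from the docstring's definition of `axis=None` versus `axis=0`:

```python
import tensorflow as tf

# Default (no-op) distribution strategy: a single replica on the host device.
strategy = tf.distribute.get_strategy()

# With one replica, `run` simply returns the per-replica value.
per_replica = strategy.run(lambda: tf.constant([0., 1., 2., 3.]))

# axis=None: aggregate across replicas only; the per-example dimension is kept.
print(strategy.reduce("SUM", per_replica, axis=None).numpy())  # [0. 1. 2. 3.]

# axis=0: additionally sum over the batch dimension, producing a scalar.
print(strategy.reduce("SUM", per_replica, axis=0).numpy())  # 6.0
```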
_ProcessStatusInfo = collections.namedtuple( - '_ProcessStatusInfo', - ['task_type', 'is_successful', 'exc_info', 'return_value']) + '_ProcessStatusInfo', ['is_successful', 'exc_info', 'return_value']) # Information returned from a successful MultiProcessRunner run. MultiProcessRunnerResult = collections.namedtuple('MultiProcessRunnerResult', @@ -124,6 +123,7 @@ class MultiProcessRunner(object): stream_stdout=True, list_stdout=False, use_dill_for_args=True, + daemon=False, args=None, kwargs=None): """Creates a multi-process runner. @@ -144,6 +144,9 @@ class MultiProcessRunner(object): `signal.alarm()` api. Note that this is best effort at Python level since Python signal handler does not get executed when it runs lower level C/C++ code. So it can be delayed for arbitrarily long time. + If any of the child process is still running when `max_run_time` is up, + they will be force-terminated and a `UnexpectedSubprocessExitError` + may be raised at `join()`. grpc_fail_fast: Whether GRPC connection between processes should fail without retrying. Defaults to None, in which case the environment variable is not explicitly set. @@ -157,6 +160,7 @@ class MultiProcessRunner(object): use_dill_for_args: Whether to use dill to pickle `args` and `kwargs`. dill can pickle more objects, but doesn't work with types in `multiprocessing` library like `Mutex`. + daemon: Whether to start processes as daemons. args: Positional arguments to be sent to functions run on processes. kwargs: Keyword arguments to be sent to functions run on processes. @@ -188,6 +192,7 @@ class MultiProcessRunner(object): self._list_stdout = list_stdout self._dependence_on_chief = True self._use_dill_for_args = use_dill_for_args + self._daemon = daemon self._args = args or () self._kwargs = kwargs or {} @@ -268,7 +273,8 @@ class MultiProcessRunner(object): test_env=test_env, target=_ProcFunc(), args=(resources, test_env, proc_func, args, kwargs, - self._use_dill_for_args)) + self._use_dill_for_args), + daemon=self._daemon) p.start() self._processes[(task_type, task_id)] = p self._outstanding_subprocess_count += 1 @@ -447,11 +453,19 @@ class MultiProcessRunner(object): from subprocesses' stdout and stderr. Raises: - SubprocessTimeoutError: if not all processes report status approximatelty - within `timeout` seconds. When this is raised, a - `MultiProcessRunnerResult` object can be retrieved by - `SubprocessTimeoutError`'s mpr_result attribute, which has the same - structure as above 'Returns' section describes. + SubprocessTimeoutError: if not all processes report status approximately + within `timeout` seconds. When this is raised, a + `MultiProcessRunnerResult` object can be retrieved by + `SubprocessTimeoutError`'s mpr_result attribute, which has the same + structure as above 'Returns' section describes. + UnexpectedSubprocessExitError: If any of the subprocesses did not exit + properly (for example, they exit on SIGTERM or SIGKILL signal). When + this is raised, a `MultiProcessRunnerResult` object can be retrieved by + `UnexpectedSubprocessExitError`'s mpr_result attribute, which has the + same structure as above 'Returns' section describes. If `max_run_time` + is not `None`, it is expected that some subprocesses may be + force-killed when `max_run_time` is up, and this is raised in those + cases. Exception: if there is an Exception propagated from any subprocess. 
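As a hedged illustration of the `join()` error contract described above (an editor's sketch, assuming it runs in a test driven by `multi_process_runner.test_main()`), the new `UnexpectedSubprocessExitError` can be caught and its `mpr_result` inspected like a normal result:

```python
from tensorflow.python.distribute import multi_process_runner
from tensorflow.python.distribute import multi_worker_test_base


def proc_func():
  return 'ok'

mpr = multi_process_runner.MultiProcessRunner(
    proc_func,
    multi_worker_test_base.create_cluster_spec(num_workers=2),
    list_stdout=True)
mpr.start()
try:
  result = mpr.join(timeout=120)
except multi_process_runner.UnexpectedSubprocessExitError as e:
  # Partial results, including the captured stdout lines, are still available.
  for line in e.mpr_result.stdout:
    print(line)
  raise
```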
""" if self._joined: @@ -475,14 +489,28 @@ class MultiProcessRunner(object): process_statuses = self._queue_to_list(self._process_status_queue) if not self._all_forced_terminated and len( process_statuses) != self._outstanding_subprocess_count: - raise RuntimeError( - 'missing statuses from %d subproceses.' % - (self._outstanding_subprocess_count - len(process_statuses))) + raise UnexpectedSubprocessExitError( + 'Missing status(es) from %d subprocess(es). See logs for details.' % + (self._outstanding_subprocess_count - len(process_statuses)), + self._get_mpr_result(process_statuses)) for process_status in process_statuses: assert isinstance(process_status, _ProcessStatusInfo) if not process_status.is_successful: six.reraise(*process_status.exc_info) + # Checking all the processes that are expected to exit properly. + for (task_type, task_id), p in self._processes.items(): + if self._dependence_on_chief and task_type != 'chief': + # If _dependence_on_chief, other processes may have been + # forced-terminated, which is expected. + continue + # Successfully exiting process has exit code 0. + if p.exitcode > 0: + raise UnexpectedSubprocessExitError( + 'Subprocess %s-%d exited with exit code %d. See logs for details.' % + (task_type, task_id, p.exitcode), + self._get_mpr_result(process_statuses)) + logging.info('Joining log reading threads.') for thread in self._reading_threads: thread.join() @@ -518,6 +546,8 @@ class MultiProcessRunner(object): for (task_type, task_id), p in self._processes.items(): try: os.kill(p.pid, sig) + logging.info('%s-%d terminated with signal %r.', task_type, task_id, + sig) except ProcessLookupError: logging.info('Attempting to kill %s-%d but it does not exist.', task_type, task_id) @@ -580,7 +610,6 @@ class _ProcFunc(object): time.sleep(0.1) self._resources.process_status_queue.put( _ProcessStatusInfo( - task_type=task_type, is_successful=True, exc_info=None, return_value=None)) @@ -640,17 +669,9 @@ class _ProcFunc(object): if test_env.v2_enabled: v2_compat.enable_v2_behavior() - try: - with self._runtime_mode(test_env.executing_eagerly): - return_value = proc_func(*args, **kwargs) - is_successful = True - exc_info = None - - except Exception: # pylint: disable=broad-except - # Capture all exceptions to be reported to parent process. - return_value = None - is_successful = False - exc_info = sys.exc_info() + with self._runtime_mode(test_env.executing_eagerly): + info = _run_contained(proc_func, args, kwargs) + self._resources.process_status_queue.put(info) # Re-raise the exception in addition to reporting it to the parent # process, so that even if `--test_timeout` flag is set and the @@ -659,17 +680,190 @@ class _ProcFunc(object): # instead of silently suppressing the error due to early bazel # timeout. Raising an error in the subprocess produces stack trace in # the log, but the program continues running. - raise + if not info.is_successful: + six.reraise(*info.exc_info) - finally: - info = _ProcessStatusInfo( - task_type=test_env.task_type, - is_successful=is_successful, - exc_info=exc_info, - return_value=return_value) - self._resources.process_status_queue.put(info) self._close_streaming() + # Exit with code 0 as it's considered successful exit at this point. + sys.exit(0) + + +class MultiProcessPoolRunner(object): + """A utility class to start a process pool to simulate a cluster. + + It's similar to MultiProcessRunner, but uses a pool of processes to avoid the + expensive initialization cost of Tensorflow. 
+ """ + + def __init__(self, cluster_spec, initializer=None): + """Creates a multi-process pool runner. + + Args: + cluster_spec: Dict for cluster spec. The following is an example of + cluster with three workers. + {"worker": ["worker0.example.com:2222", + "worker1.example.com:2222", + "worker2.example.com:2222"]} + initializer: a callable to called at the startup of worker processes. + + Raises: + RuntimeError: if `multi_process_runner.test_main()` is not called. + ValueError: if there are more than one chief in the `cluster_spec`. + """ + self._cluster_spec = cluster_spec + self._initializer = initializer + self._conn = {} + self._runner = None + + def __del__(self): + self._reset() + + def _reset(self): + for conn in self._conn.values(): + conn.close() + self._conn = {} + if self._runner is not None: + self._runner.join() + self._runner = None + + def _start(self): + """Starts the worker pool.""" + # We need different arguments for different processes so we're passing a + # no-op proc_func here and use start_single_process instead. + # + # We also need to start the process pool as daemon, so that they don't block + # the program from exiting. Note that __del__ may not get called when + # there's an exception. The user may also store a pool runner in a global + # object to share across test cases + + if dill is None: + raise unittest.SkipTest( + 'TODO(b/150264776): Resolve dependency issue in CI') + + self._runner = MultiProcessRunner( + proc_func=lambda: None, + cluster_spec=self._cluster_spec, + use_dill_for_args=False, + daemon=True) + if self._initializer: + initializer = dill.dumps(self._initializer, dill.HIGHEST_PROTOCOL) + else: + initializer = None + for task_type, addresses in self._cluster_spec.items(): + for task_id, _ in enumerate(addresses): + conn1, conn2 = multiprocessing.Pipe(duplex=True) + self._conn[(task_type, task_id)] = conn1 + self._runner.start_single_process( + task_type, + task_id, + proc_func=_pool_runner_worker, + args=(initializer, conn2)) + + def run(self, proc_func, args=None, kwargs=None): + """Runs `proc_func` with `args` and `kwargs` on all jobs. + + Args: + proc_func: The function to be run. + args: Optional positional arguments to be supplied in `proc_func`. + kwargs: Optional keyword arguments to be supplied in `proc_func`. + + Returns: + A list of return values. + """ + if self._runner is None: + self._start() + + # Since we start the processes as daemon they're going to be killed by + # SIGTERM when the program exits. We only turn on streaming during run() to + # avoid printing the stacktrace caused by the SIGTERM. + self._runner._stream_stdout = True # pylint: disable=protected-access + + try: + proc_func = dill.dumps(proc_func, dill.HIGHEST_PROTOCOL) + for conn in self._conn.values(): + conn.send((proc_func, args or [], kwargs or {})) + + process_statuses = [] + for (task_type, task_id), conn in self._conn.items(): + logging.info('Waiting for the result from %s-%d', task_type, task_id) + try: + process_statuses.append(conn.recv()) + except EOFError: + # This shouldn't happen due to exceptions in proc_func. This usually + # means bugs in the runner. + self._reset() + raise RuntimeError('Unexpected EOF. Worker process may have died. 
' + 'Please report a bug') + + return_values = [] + for process_status in process_statuses: + assert isinstance(process_status, _ProcessStatusInfo) + if not process_status.is_successful: + six.reraise(*process_status.exc_info) + if process_status.return_value is not None: + return_values.append(process_status.return_value) + + return return_values + finally: + self._runner._stream_stdout = False # pylint: disable=protected-access + + +def _pool_runner_worker(initializer, conn): + """Function that runs on the workers in a pool. + + It listens for callables to run and returns the result until `conn` is closed. + It captures the exceptions during executing the callable and return it through + `conn`. + + Args: + initializer: A callable to execute during startup. + conn: A multiprocessing.Connection object to listen for tasks and send + results. + """ + if initializer: + initializer = dill.loads(initializer) + initializer() + while True: + try: + proc_func, args, kwargs = conn.recv() + except EOFError: + break + proc_func = dill.loads(proc_func) + info = _run_contained(proc_func, args, kwargs) + sys.stdout.flush() + sys.stderr.flush() + conn.send(info) + + +def _run_contained(proc_func, args, kwargs): + """Runs `proc_func` with `args` and `kwargs`. + + The function returns _ProcessStatusInfo which captures the return value and + the exception. + + Args: + proc_func: The function to be run. + args: Optional positional arguments to be supplied in `proc_func`. + kwargs: Optional keyword arguments to be supplied in `proc_func`. + + Returns: + a _ProcessStatusInfo. + """ + try: + return_value = proc_func(*args, **kwargs) + is_successful = True + exc_info = None + except Exception: # pylint: disable=broad-except + return_value = None + is_successful = False + exc_info = sys.exc_info() + finally: + return _ProcessStatusInfo( # pylint: disable=lost-exception + is_successful=is_successful, + exc_info=exc_info, + return_value=return_value) + class SubprocessTimeoutError(RuntimeError): """An error that indicates there is at least one subprocess timing out. @@ -684,6 +878,19 @@ class SubprocessTimeoutError(RuntimeError): self.mpr_result = mpr_result +class UnexpectedSubprocessExitError(RuntimeError): + """An error indicating there is at least one subprocess with unexpected exit. + + When this is raised, a `MultiProcessRunnerResult` object can be retrieved by + `UnexpectedSubprocessExitError`'s mpr_result attribute. See + `MultiProcessRunner.join()` for more information. 
+ """ + + def __init__(self, msg, mpr_result): + super(UnexpectedSubprocessExitError, self).__init__(msg) + self.mpr_result = mpr_result + + def _set_tf_config(task_type, task_id, cluster_spec, rpc_layer=None): """Set TF_CONFIG environment variable.""" tf_config_dict = { diff --git a/tensorflow/python/distribute/multi_process_runner_test.py b/tensorflow/python/distribute/multi_process_runner_test.py index d6e04010e34..acec6d0c999 100644 --- a/tensorflow/python/distribute/multi_process_runner_test.py +++ b/tensorflow/python/distribute/multi_process_runner_test.py @@ -18,10 +18,14 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import ctypes import json import os +import sys import threading import time +import unittest + from absl import logging from tensorflow.python.distribute import multi_process_runner @@ -45,7 +49,7 @@ def proc_func_that_adds_simple_return_data(): return 'dummy_data' -def proc_func_that_return_args_and_kwargs(*args, **kwargs): +def proc_func_that_returns_args_and_kwargs(*args, **kwargs): return list(args) + list(kwargs.items()) @@ -53,6 +57,20 @@ def proc_func_with_barrier(): return multi_process_runner.barrier() +def proc_func_that_returns_pid(): + return os.getpid() + + +V = None + + +def proc_func_that_sets_global(val): + global V + old_val = V + V = val + return old_val + + class MultiProcessRunnerTest(test.TestCase): def _worker_idx(self): @@ -95,7 +113,7 @@ class MultiProcessRunnerTest(test.TestCase): def test_multi_process_runner_args_passed_correctly(self): return_value = multi_process_runner.run( - proc_func_that_return_args_and_kwargs, + proc_func_that_returns_args_and_kwargs, multi_worker_test_base.create_cluster_spec(num_workers=1), args=('a', 'b'), kwargs={ @@ -298,6 +316,103 @@ class MultiProcessRunnerTest(test.TestCase): self.assertTrue( any('something printed' in line for line in list_to_assert)) + def test_seg_fault_raises_error(self): + + def proc_func_expected_to_seg_fault(): + ctypes.string_at(0) # Intentionally made seg fault. + + with self.assertRaises( + multi_process_runner.UnexpectedSubprocessExitError) as cm: + multi_process_runner.run( + proc_func_expected_to_seg_fault, + multi_worker_test_base.create_cluster_spec(num_workers=1), + list_stdout=True) + self.assertIn('Missing status(es) from 1 subprocess(es).', + str(cm.exception)) + list_to_assert = cm.exception.mpr_result.stdout + self.assertTrue(any('SIGSEGV' in line for line in list_to_assert)) + + def test_seg_fault_in_chief_raises_error(self): + + def proc_func_expected_to_seg_fault(): + if multi_worker_test_base.get_task_type() == 'worker': + time.sleep(10000) + ctypes.string_at(0) # Intentionally made seg fault. 
+ + with self.assertRaises( + multi_process_runner.UnexpectedSubprocessExitError) as cm: + multi_process_runner.run( + proc_func_expected_to_seg_fault, + multi_worker_test_base.create_cluster_spec( + has_chief=True, num_workers=1), + list_stdout=True) + self.assertIn('Subprocess chief-0 exited with exit code', + str(cm.exception)) + list_to_assert = cm.exception.mpr_result.stdout + self.assertTrue(any('SIGSEGV' in line for line in list_to_assert)) + + def test_non_zero_exit_code_raises_error(self): + + def proc_func_expected_to_exit_with_1(): + sys.exit(1) + + with self.assertRaises( + multi_process_runner.UnexpectedSubprocessExitError) as cm: + multi_process_runner.run( + proc_func_expected_to_exit_with_1, + multi_worker_test_base.create_cluster_spec(num_workers=1)) + self.assertIn('Missing status(es) from 1 subprocess(es).', + str(cm.exception)) + + +class MultiProcessPoolRunnerTest(test.TestCase): + + def test_same_process_across_runs(self): + cluster_spec = multi_worker_test_base.create_cluster_spec(num_workers=2) + runner = multi_process_runner.MultiProcessPoolRunner(cluster_spec) + pid = runner.run(proc_func_that_returns_pid) + for _ in range(3): + self.assertAllEqual(runner.run(proc_func_that_returns_pid), pid) + + def test_exceptions_in_sub_process(self): + cluster_spec = multi_worker_test_base.create_cluster_spec(num_workers=2) + runner = multi_process_runner.MultiProcessPoolRunner(cluster_spec) + pid = runner.run(proc_func_that_returns_pid) + with self.assertRaisesRegexp(ValueError, 'This is an error.'): + runner.run(proc_func_that_errors) + self.assertAllEqual(runner.run(proc_func_that_returns_pid), pid) + + def test_tf_config(self): + cluster_spec = multi_worker_test_base.create_cluster_spec( + has_chief=True, num_workers=2) + runner = multi_process_runner.MultiProcessPoolRunner(cluster_spec) + result = runner.run(proc_func_that_adds_task_type_in_return_data) + + job_count_dict = {'worker': 2, 'chief': 1} + for data in result: + job_count_dict[data] -= 1 + + self.assertEqual(job_count_dict['worker'], 0) + self.assertEqual(job_count_dict['chief'], 0) + + @unittest.expectedFailure + def test_exception_in_main_process(self): + # When there's an exception in the main process, __del__() is not called. + # This test is to verify MultiProcessPoolRunner can cope with __del__() not + # being called. 
+ cluster_spec = multi_worker_test_base.create_cluster_spec( + has_chief=True, num_workers=2) + runner = multi_process_runner.MultiProcessPoolRunner(cluster_spec) + runner.run(proc_func_that_returns_pid) + raise ValueError('failure') + + def test_initializer(self): + cluster_spec = multi_worker_test_base.create_cluster_spec(num_workers=2) + runner = multi_process_runner.MultiProcessPoolRunner( + cluster_spec, initializer=lambda: proc_func_that_sets_global(1)) + result = runner.run(proc_func_that_sets_global, args=(2,)) + self.assertAllEqual(result, [1, 1]) + if __name__ == '__main__': multi_process_runner.test_main() diff --git a/tensorflow/python/distribute/packed_distributed_variable.py b/tensorflow/python/distribute/packed_distributed_variable.py index c249b8efc1c..4c9433dc164 100644 --- a/tensorflow/python/distribute/packed_distributed_variable.py +++ b/tensorflow/python/distribute/packed_distributed_variable.py @@ -108,6 +108,10 @@ class PackedDistributedVariable(resource_variable_ops.BaseResourceVariable): else: return self._handle + @property + def packed_handle(self): + return self._handle + def _read_variable_op(self): if context.executing_eagerly(): return self.get_var_on_current_device().value() diff --git a/tensorflow/python/distribute/saved_model_save_load_test.py b/tensorflow/python/distribute/saved_model_save_load_test.py index 23050a612f5..2b753c1e1c8 100644 --- a/tensorflow/python/distribute/saved_model_save_load_test.py +++ b/tensorflow/python/distribute/saved_model_save_load_test.py @@ -70,6 +70,20 @@ class SavedModelKerasModelTest(test_base.TestSavedModelBase): distribution_for_restoring, save_in_scope) + @combinations.generate( + combinations.times(test_base.simple_models_with_strategies(), + combinations.combine(save_in_scope=[True, False]))) + def test_no_variable_device_placement(self, model_and_input, distribution, + save_in_scope): + saved_dir = self.run_test_save_strategy(model_and_input, distribution, + save_in_scope) + func = saved_model.load(saved_dir) + concrete_function = func.signatures[test_base._DEFAULT_FUNCTION_KEY] + for f in concrete_function.graph.as_graph_def().library.function: + for n in f.node_def: + if n.op == 'ReadVariableOp': + self.assertEmpty(n.device) + class SavedModelTFModuleTest(test_base.TestSavedModelBase): diff --git a/tensorflow/python/distribute/saved_model_test_base.py b/tensorflow/python/distribute/saved_model_test_base.py index 70ea582baff..1fab0f2b0bd 100644 --- a/tensorflow/python/distribute/saved_model_test_base.py +++ b/tensorflow/python/distribute/saved_model_test_base.py @@ -274,3 +274,20 @@ class TestSavedModelBase(test.TestCase, parameterized.TestCase): tolerance = get_tolerance(distribution_for_saving, distribution_for_restoring) self.assertAllClose(result_before_save, load_result, atol=tolerance) + + def run_test_save_strategy(self, model_and_input, + distribution, save_in_scope): + """Save a model with DS.""" + saved_dir = os.path.join(self.get_temp_dir(), '3') + with distribution.scope(): + model = model_and_input.get_model() + x_train, y_train, _ = model_and_input.get_data() + batch_size = model_and_input.get_batch_size() + self._train_model(model, x_train, y_train, batch_size) + + if save_in_scope: + with distribution.scope(): + self._save_model(model, saved_dir) + else: + self._save_model(model, saved_dir) + return saved_dir diff --git a/tensorflow/python/distribute/strategy_common_test.py b/tensorflow/python/distribute/strategy_common_test.py index 7070fbbf18f..7744364c544 100644 --- 
a/tensorflow/python/distribute/strategy_common_test.py +++ b/tensorflow/python/distribute/strategy_common_test.py @@ -156,6 +156,8 @@ class StrategyClusterResolverTest(test.TestCase, parameterized.TestCase): with strategy.scope(): self.assertIs(strategy.cluster_resolver, resolver) self.assertTrue(hasattr(resolver, 'cluster_spec')) + if isinstance(strategy, TPUStrategy): + self.skipTest('b/159747888') self.assertTrue(hasattr(resolver, 'environment')) self.assertTrue(hasattr(resolver, 'master')) self.assertTrue(hasattr(resolver, 'num_accelerators')) diff --git a/tensorflow/python/distribute/tpu_strategy_test.py b/tensorflow/python/distribute/tpu_strategy_test.py index 142743a6ec2..850981e073e 100644 --- a/tensorflow/python/distribute/tpu_strategy_test.py +++ b/tensorflow/python/distribute/tpu_strategy_test.py @@ -123,6 +123,15 @@ class TPUTest(test.TestCase): result = bar() + 1 self.assertAllEqual(result, 2) + def test_on_demand_op_with_dynamic_output(self): + with ops.device("/device:TPU:0"): + where_output = array_ops.where([True, False, True]) + self.assertAllEqual(where_output, [[0], [2]]) + + with ops.device("/device:TPU:0"): + repeat_output = array_ops.repeat(math_ops.range(2), [1, 4]) + self.assertAllEqual(repeat_output, [0, 1, 1, 1, 1]) + @parameterized.named_parameters([("PackedVar", True), ("", False)]) class TPUStrategyTest(test.TestCase, parameterized.TestCase): diff --git a/tensorflow/python/distribute/values.py b/tensorflow/python/distribute/values.py index 37643e03b18..35f040edc83 100644 --- a/tensorflow/python/distribute/values.py +++ b/tensorflow/python/distribute/values.py @@ -35,6 +35,7 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import variable_scope as vs from tensorflow.python.ops import variables as variables_lib +from tensorflow.python.saved_model import save_context from tensorflow.python.training.saving import saveable_object from tensorflow.python.training.saving import saveable_object_util from tensorflow.python.training.tracking import base as trackable @@ -472,11 +473,10 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, # variable. self._var_policy = var_policy - @property - def _devices(self): - if self._packed_var is not None: - return tuple(d for d in self._packed_var.devices) - return tuple(v.device for v in self._values) + def _use_packed_variable(self): + # Don't use packed variable when under a SaveContext to avoid explicit + # device placement on variable consuming ops. + return self._packed_var is not None and not save_context.in_save_context() def is_initialized(self, name=None): """Identifies if all the component variables are initialized. @@ -488,7 +488,7 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, The op that evaluates to True or False depending on if all the component variables are initialized. 
""" - if self._packed_var is not None: + if self._use_packed_variable(): return self._packed_var.is_initialized() result = self._primary.is_initialized() # We iterate through the list of values except the last one to allow us to @@ -562,7 +562,9 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, @property def _packed_variable(self): - return self._packed_var + if self._use_packed_variable(): + return self._packed_var + return None @property def handle(self): @@ -571,7 +573,7 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, raise ValueError("`handle` is not available outside the replica context" " or a `tf.distribute.Strategy.update()` call.") else: - if self._packed_var is not None: + if self._use_packed_variable(): return self._packed_var.handle return self._values[replica_id].handle @@ -623,7 +625,7 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, def _get_replica(self, replica_id): """Returns the value on a device with the given replica_id.""" - if self._packed_var is not None: + if self._use_packed_variable(): return self._packed_var.on_device(self._devices[replica_id]) return self._values[replica_id] @@ -844,8 +846,9 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, obj_map[v] = new_obj resource_map[v.handle] = new_obj.handle obj_map[self] = new_obj - resource_map[self.handle] = new_obj.handle resource_map[self] = new_obj.handle + if self._packed_var is not None: + resource_map[self._packed_var.packed_handle] = new_obj.handle return obj_map, resource_map diff --git a/tensorflow/python/distribute/values_test.py b/tensorflow/python/distribute/values_test.py index d0e3eec22a8..69884a06814 100644 --- a/tensorflow/python/distribute/values_test.py +++ b/tensorflow/python/distribute/values_test.py @@ -55,6 +55,7 @@ from tensorflow.python.ops import random_ops from tensorflow.python.ops import sparse_ops from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables as variables_lib +from tensorflow.python.saved_model import save_context from tensorflow.python.saved_model.model_utils import mode_keys from tensorflow.python.tpu import tpu_strategy_util from tensorflow.python.training import saver as saver_lib @@ -753,6 +754,16 @@ class PackedDistributedVariableTest(test.TestCase, parameterized.TestCase): self.assertEqual(val.device, devices[i]) self.assertEqual(self.evaluate(val.read_value()), i) + def testIgnorePackedVariableInSaveContext(self, distribution): + distribution._enable_packed_variable_in_eager_mode = True + with distribution.scope(): + v = variables_lib.Variable(0) + self.assertIsInstance( + v._packed_variable, packed.PackedDistributedVariable) + + with save_context.save_context(): + self.assertIsNone(v._packed_variable) + class MirroredVariableTest(test.TestCase, parameterized.TestCase): diff --git a/tensorflow/python/eager/pywrap_gradient_exclusions.cc b/tensorflow/python/eager/pywrap_gradient_exclusions.cc index 7da45e36118..7e9f0b16334 100644 --- a/tensorflow/python/eager/pywrap_gradient_exclusions.cc +++ b/tensorflow/python/eager/pywrap_gradient_exclusions.cc @@ -50,7 +50,7 @@ auto OpGradientInfoInit(const T &a) { absl::optional> OpGradientUnusedInputIndices( const tensorflow::string &op_name) { - static std::array a = {{ + static std::array a = {{ {"Acosh"}, {"AllToAll", 1, {0}}, {"ApproximateEqual"}, @@ -222,7 +222,6 @@ absl::optional> OpGradientUnusedInputIndices( {"PlaceholderWithDefault"}, {"PopulationCount"}, {"PreventGradient"}, - {"Qr"}, 
{"QuantizeAndDequantize"}, {"QuantizeAndDequantizeV2"}, {"QuantizeAndDequantizeV3"}, diff --git a/tensorflow/python/keras/distribute/BUILD b/tensorflow/python/keras/distribute/BUILD index c6a8f2c5f91..4245d70b1f0 100644 --- a/tensorflow/python/keras/distribute/BUILD +++ b/tensorflow/python/keras/distribute/BUILD @@ -77,6 +77,32 @@ cuda_py_test( ], ) +distribute_py_test( + name = "custom_training_loop_models_test", + srcs = ["custom_training_loop_models_test.py"], + main = "custom_training_loop_models_test.py", + tags = [ + "multi_and_single_gpu", + ], + tpu_tags = [ + "no_oss", # b/153615544. + ], + deps = [ + "//tensorflow/python:math_ops", + "//tensorflow/python:util", + "//tensorflow/python/data/ops:dataset_ops", + "//tensorflow/python/distribute:combinations", + "//tensorflow/python/distribute:reduce_util", + "//tensorflow/python/distribute:strategy_combinations", + "//tensorflow/python/eager:backprop", + "//tensorflow/python/eager:def_function", + "//tensorflow/python/eager:test", + "//tensorflow/python/keras", + "//tensorflow/python/module", + "@absl_py//absl/testing:parameterized", + ], +) + py_library( name = "distribute_strategy_test_lib", srcs = [ diff --git a/tensorflow/python/distribute/custom_training_loop_models_test.py b/tensorflow/python/keras/distribute/custom_training_loop_models_test.py similarity index 100% rename from tensorflow/python/distribute/custom_training_loop_models_test.py rename to tensorflow/python/keras/distribute/custom_training_loop_models_test.py diff --git a/tensorflow/python/keras/layers/convolutional.py b/tensorflow/python/keras/layers/convolutional.py index 51f4e3b320a..19d5ea71527 100644 --- a/tensorflow/python/keras/layers/convolutional.py +++ b/tensorflow/python/keras/layers/convolutional.py @@ -72,6 +72,10 @@ class Conv(Layer): Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: One of `"valid"`, `"same"`, or `"causal"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. `"causal"` results in causal + (dilated) convolutions, e.g. `output[t]` does not depend on `input[t+1:]`. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape @@ -417,7 +421,10 @@ class Conv1D(Conv): specifying the stride length of the convolution. Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. - padding: One of `"valid"`, `"causal"` or `"same"` (case-insensitive). + padding: One of `"valid"`, `"same"` or `"causal"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. `"causal"` results in causal (dilated) convolutions, e.g. `output[t]` does not depend on `input[t+1:]`. Useful when modeling temporal data where the model should not violate the temporal order. @@ -571,6 +578,9 @@ class Conv2D(Conv): specify the same value for all spatial dimensions. Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: one of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. 
data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape `(batch_size, height, width, channels)` while @@ -712,6 +722,9 @@ class Conv3D(Conv): specify the same value for all spatial dimensions. Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: one of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape `batch_shape + (spatial_dim1, spatial_dim2, @@ -833,6 +846,9 @@ class Conv1DTranspose(Conv1D): time dimension. Specifying a stride value != 1 is incompatible with specifying a `dilation_rate` value != 1. Defaults to 1. padding: one of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. output_padding: An integer specifying the amount of padding along the time dimension of the output tensor. The amount of output padding must be lower than the stride. @@ -1083,6 +1099,9 @@ class Conv2DTranspose(Conv2D): Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: one of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. output_padding: An integer or tuple/list of 2 integers, specifying the amount of padding along the height and width of the output tensor. @@ -1371,19 +1390,22 @@ class Conv3DTranspose(Conv3D): Arguments: filters: Integer, the dimensionality of the output space - (i.e. the number of output filters in the convolution). + (i.e. the number of output filters in the convolution). kernel_size: An integer or tuple/list of 3 integers, specifying the - depth, height and width of the 3D convolution window. - Can be a single integer to specify the same value for - all spatial dimensions. + depth, height and width of the 3D convolution window. + Can be a single integer to specify the same value for + all spatial dimensions. strides: An integer or tuple/list of 3 integers, - specifying the strides of the convolution along the depth, height - and width. - Can be a single integer to specify the same value for - all spatial dimensions. - Specifying any stride value != 1 is incompatible with specifying - any `dilation_rate` value != 1. + specifying the strides of the convolution along the depth, height + and width. + Can be a single integer to specify the same value for + all spatial dimensions. + Specifying any stride value != 1 is incompatible with specifying + any `dilation_rate` value != 1. padding: one of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. output_padding: An integer or tuple/list of 3 integers, specifying the amount of padding along the depth, height, and width. 
@@ -1681,6 +1703,9 @@ class SeparableConv(Conv): Specifying any `stride` value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape @@ -1885,6 +1910,10 @@ class SeparableConv1D(SeparableConv): Specifying any `stride` value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: One of `"valid"`, `"same"`, or `"causal"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. `"causal"` results in causal + (dilated) convolutions, e.g. `output[t]` does not depend on `input[t+1:]`. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape @@ -2070,6 +2099,9 @@ class SeparableConv2D(SeparableConv): Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: one of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. @@ -2230,6 +2262,9 @@ class DepthwiseConv2D(Conv2D): Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: one of `'valid'` or `'same'` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. depth_multiplier: The number of depthwise convolution output channels for each input channel. The total number of depthwise convolution output diff --git a/tensorflow/python/keras/layers/convolutional_recurrent.py b/tensorflow/python/keras/layers/convolutional_recurrent.py index 6c812204cba..a6f205676ca 100644 --- a/tensorflow/python/keras/layers/convolutional_recurrent.py +++ b/tensorflow/python/keras/layers/convolutional_recurrent.py @@ -434,6 +434,9 @@ class ConvLSTM2DCell(DropoutRNNCellMixin, Layer): Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. It defaults to the `image_data_format` value found in your @@ -710,6 +713,9 @@ class ConvLSTM2D(ConvRNN2D): Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. 
data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. diff --git a/tensorflow/python/keras/layers/local.py b/tensorflow/python/keras/layers/local.py index 3e9c0f9c0a3..c33c88f3a3d 100644 --- a/tensorflow/python/keras/layers/local.py +++ b/tensorflow/python/keras/layers/local.py @@ -67,6 +67,7 @@ class LocallyConnected1D(Layer): any `dilation_rate` value != 1. padding: Currently only supports `"valid"` (case-insensitive). `"same"` may be supported in the future. + `"valid"` means no padding. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. @@ -375,6 +376,7 @@ class LocallyConnected2D(Layer): all spatial dimensions. padding: Currently only support `"valid"` (case-insensitive). `"same"` will be supported in future. + `"valid"` means no padding. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. diff --git a/tensorflow/python/keras/layers/pooling.py b/tensorflow/python/keras/layers/pooling.py index ff7d157acad..51dc5131a8a 100644 --- a/tensorflow/python/keras/layers/pooling.py +++ b/tensorflow/python/keras/layers/pooling.py @@ -164,8 +164,9 @@ class MaxPooling1D(Pooling1D): for each pooling step. If None, it will default to `pool_size`. padding: One of `"valid"` or `"same"` (case-insensitive). - "valid" adds no padding. "same" adds padding such that if the stride - is 1, the output shape is the same as the input shape. + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. @@ -209,6 +210,9 @@ class AveragePooling1D(Pooling1D): E.g. 2 will halve the input. If None, it will default to `pool_size`. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. @@ -419,8 +423,9 @@ class MaxPooling2D(Pooling2D): Strides values. Specifies how far the pooling window moves for each pooling step. If None, it will default to `pool_size`. padding: One of `"valid"` or `"same"` (case-insensitive). - "valid" adds no zero padding. "same" adds padding such that if the stride - is 1, the output shape is the same as input shape. + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. @@ -475,6 +480,9 @@ class AveragePooling2D(Pooling2D): Strides values. If None, it will default to `pool_size`. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. 
@@ -617,6 +625,9 @@ class MaxPooling3D(Pooling3D): `(2, 2, 2)` will halve the size of the 3D input in each dimension. strides: tuple of 3 integers, or None. Strides values. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. @@ -667,6 +678,9 @@ class AveragePooling3D(Pooling3D): `(2, 2, 2)` will halve the size of the 3D input in each dimension. strides: tuple of 3 integers, or None. Strides values. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. diff --git a/tensorflow/python/keras/layers/preprocessing/BUILD b/tensorflow/python/keras/layers/preprocessing/BUILD index 6916712d52c..9adf97d1fa5 100644 --- a/tensorflow/python/keras/layers/preprocessing/BUILD +++ b/tensorflow/python/keras/layers/preprocessing/BUILD @@ -369,6 +369,7 @@ distribute_py_test( tags = [ "multi_and_single_gpu", ], + tpu_tags = ["no_oss"], deps = [ ":category_crossing", "//tensorflow/python/distribute:combinations", diff --git a/tensorflow/python/keras/layers/preprocessing/image_preprocessing_distribution_test.py b/tensorflow/python/keras/layers/preprocessing/image_preprocessing_distribution_test.py index 0b93c1d57c6..7fc2b42c919 100644 --- a/tensorflow/python/keras/layers/preprocessing/image_preprocessing_distribution_test.py +++ b/tensorflow/python/keras/layers/preprocessing/image_preprocessing_distribution_test.py @@ -40,9 +40,10 @@ class ImagePreprocessingDistributionTest( preprocessing_test_utils.PreprocessingLayerTest): def test_distribution(self, distribution): - np_images = np.random.random((1000, 32, 32, 3)).astype(np.float32) + # TODO(b/159738418): large image input causes OOM in ubuntu multi gpu. + np_images = np.random.random((32, 32, 32, 3)).astype(np.float32) image_dataset = dataset_ops.Dataset.from_tensor_slices(np_images).batch( - 32, drop_remainder=True) + 16, drop_remainder=True) with distribution.scope(): input_data = keras.Input(shape=(32, 32, 3), dtype=dtypes.float32) @@ -58,7 +59,7 @@ class ImagePreprocessingDistributionTest( output = flatten_layer(preprocessed_image) cls_layer = keras.layers.Dense(units=1, activation="sigmoid") output = cls_layer(output) - model = keras.Model(inputs=input_data, outputs=preprocessed_image) + model = keras.Model(inputs=input_data, outputs=output) model.compile(loss="binary_crossentropy") _ = model.predict(image_dataset) diff --git a/tensorflow/python/keras/legacy_tf_layers/convolutional.py b/tensorflow/python/keras/legacy_tf_layers/convolutional.py index 4c91251a0e7..4fd53531fd1 100644 --- a/tensorflow/python/keras/legacy_tf_layers/convolutional.py +++ b/tensorflow/python/keras/legacy_tf_layers/convolutional.py @@ -46,6 +46,9 @@ class Conv1D(keras_layers.Conv1D, base.Layer): Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. 
`"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape @@ -157,6 +160,9 @@ def conv1d(inputs, Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape @@ -242,6 +248,9 @@ class Conv2D(keras_layers.Conv2D, base.Layer): Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape @@ -360,6 +369,9 @@ def conv2d(inputs, Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape @@ -449,6 +461,9 @@ class Conv3D(keras_layers.Conv3D, base.Layer): Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape @@ -568,6 +583,9 @@ def conv3d(inputs, Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape @@ -652,6 +670,9 @@ class SeparableConv1D(keras_layers.SeparableConv1D, base.Layer): Specifying any `stride` value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. 
data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape @@ -761,6 +782,9 @@ class SeparableConv2D(keras_layers.SeparableConv2D, base.Layer): Specifying any `stride` value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape @@ -897,6 +921,9 @@ def separable_conv1d(inputs, Specifying any `stride` value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape @@ -1019,6 +1046,9 @@ def separable_conv2d(inputs, Specifying any `stride` value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape @@ -1117,6 +1147,9 @@ class Conv2DTranspose(keras_layers.Conv2DTranspose, base.Layer): of the convolution. Can be a single integer to specify the same value for all spatial dimensions. padding: one of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape @@ -1223,6 +1256,9 @@ def conv2d_transpose(inputs, of the convolution. Can be a single integer to specify the same value for all spatial dimensions. padding: one of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape @@ -1295,6 +1331,9 @@ class Conv3DTranspose(keras_layers.Conv3DTranspose, base.Layer): Can be a single integer to specify the same value for all spatial dimensions. padding: One of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. 
`channels_last` corresponds to inputs with shape @@ -1396,6 +1435,9 @@ def conv3d_transpose(inputs, of the convolution. Can be a single integer to specify the same value for all spatial dimensions. padding: one of `"valid"` or `"same"` (case-insensitive). + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. `channels_last` corresponds to inputs with shape diff --git a/tensorflow/python/keras/tests/integration_test.py b/tensorflow/python/keras/tests/integration_test.py index 8e4d38c1a6a..64a7b694355 100644 --- a/tensorflow/python/keras/tests/integration_test.py +++ b/tensorflow/python/keras/tests/integration_test.py @@ -160,10 +160,6 @@ class SequentialIntegrationTest(KerasIntegrationTest): model.pop() model.add(keras.layers.Dense(y_train.shape[-1], activation='softmax')) - # TODO(b/134523282): There is an bug with Sequential models, so the model - # must be marked as compiled=False to ensure the next compile goes through. - model._is_compiled = False - model.compile( loss='categorical_crossentropy', optimizer=keras.optimizer_v2.adam.Adam(0.005), diff --git a/tensorflow/python/keras/utils/conv_utils.py b/tensorflow/python/keras/utils/conv_utils.py index f38fdc18252..e8ee866d958 100644 --- a/tensorflow/python/keras/utils/conv_utils.py +++ b/tensorflow/python/keras/utils/conv_utils.py @@ -264,6 +264,9 @@ def conv_kernel_mask(input_shape, kernel_shape, strides, padding): receptive field. strides: tuple of size N, strides along each spatial dimension. padding: type of padding, string `"same"` or `"valid"`. + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. Returns: A boolean 2N-D `np.ndarray` of shape @@ -338,6 +341,9 @@ def conv_kernel_idxs(input_shape, kernel_shape, strides, padding, filters_in, receptive field. strides: tuple of size N, strides along each spatial dimension. padding: type of padding, string `"same"` or `"valid"`. + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. filters_in: `int`, number if filters in the input to the layer. filters_out: `int', number if filters in the output of the layer. data_format: string, "channels_first" or "channels_last". @@ -430,6 +436,9 @@ def conv_connected_inputs(input_shape, kernel_shape, output_position, strides, in the output of the convolution. strides: tuple of size N, strides along each spatial dimension. padding: type of padding, string `"same"` or `"valid"`. + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. Returns: N ranges `[[p_in_left1, ..., p_in_right1], ..., @@ -468,6 +477,9 @@ def conv_output_shape(input_shape, kernel_shape, strides, padding): receptive field. strides: tuple of size N, strides along each spatial dimension. padding: type of padding, string `"same"` or `"valid"`. + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. 
Returns: tuple of size N: `(d_out1, ..., d_outN)`, spatial shape of the output. diff --git a/tensorflow/python/kernel_tests/qr_op_test.py b/tensorflow/python/kernel_tests/qr_op_test.py index d5337c183a6..0c291dbd940 100644 --- a/tensorflow/python/kernel_tests/qr_op_test.py +++ b/tensorflow/python/kernel_tests/qr_op_test.py @@ -278,14 +278,13 @@ if __name__ == "__main__": use_static_shape)) # TODO(pfau): Get working with complex types. - # TODO(pfau): Get working with full_matrices when rows != cols - # TODO(pfau): Get working when rows < cols + # TODO(pfau): Get working with full_matrices when rows > cols # TODO(pfau): Get working with shapeholders (dynamic shapes) for full_matrices in False, True: for dtype in np.float32, np.float64: for rows in 1, 2, 5, 10: for cols in 1, 2, 5, 10: - if rows == cols or (not full_matrices and rows > cols): + if rows <= cols or (not full_matrices and rows > cols): for batch_dims in [(), (3,)] + [(3, 2)] * (max(rows, cols) < 10): shape = batch_dims + (rows, cols) name = "%s_%s_full_%s" % (dtype.__name__, diff --git a/tensorflow/python/ops/linalg_grad.py b/tensorflow/python/ops/linalg_grad.py index 437e28e7e6b..53708dde221 100644 --- a/tensorflow/python/ops/linalg_grad.py +++ b/tensorflow/python/ops/linalg_grad.py @@ -493,15 +493,10 @@ def _QrGrad(op, dq, dr): if (r.shape.ndims is None or r.shape.as_list()[-2] is None or r.shape.as_list()[-1] is None): raise NotImplementedError("QrGrad not implemented with dynamic shapes.") - if r.shape.dims[-2].value != r.shape.dims[-1].value: + if (r.shape.dims[-2].value > r.shape.dims[-1].value and + q.shape.dims[-2].value == q.shape.dims[-1].value): raise NotImplementedError("QrGrad not implemented when nrows > ncols " - "or full_matrices is true and ncols != nrows.") - - qdq = math_ops.matmul(q, dq, adjoint_a=True) - qdq_ = qdq - _linalg.adjoint(qdq) - rdr = math_ops.matmul(r, dr, adjoint_b=True) - rdr_ = rdr - _linalg.adjoint(rdr) - tril = array_ops.matrix_band_part(qdq_ + rdr_, -1, 0) + "and full_matrices is true.") def _TriangularSolve(x, r): """Equiv to matmul(x, adjoint(matrix_inverse(r))) if r is upper-tri.""" @@ -509,9 +504,37 @@ def _QrGrad(op, dq, dr): linalg_ops.matrix_triangular_solve( r, _linalg.adjoint(x), lower=False, adjoint=False)) - grad_a = math_ops.matmul(q, dr + _TriangularSolve(tril, r)) - grad_b = _TriangularSolve(dq - math_ops.matmul(q, qdq), r) - return grad_a + grad_b + def _QrGradSquareAndDeepMatrices(q, r, dq, dr): + """Gradient for matrix orders num_rows >= num_cols + + and full_matrices is false.
+ """ + qdq = math_ops.matmul(q, dq, adjoint_a=True) + qdq_ = qdq - _linalg.adjoint(qdq) + rdr = math_ops.matmul(r, dr, adjoint_b=True) + rdr_ = rdr - _linalg.adjoint(rdr) + tril = array_ops.matrix_band_part(qdq_ + rdr_, -1, 0) + + grad_a = math_ops.matmul(q, dr + _TriangularSolve(tril, r)) + grad_b = _TriangularSolve(dq - math_ops.matmul(q, qdq), r) + return grad_a + grad_b + + num_rows, num_cols = q.shape.dims[-2].value, r.shape.dims[-1] + + if num_rows >= num_cols: + return _QrGradSquareAndDeepMatrices(q, r, dq, dr) + + # Partition a = [x, y], r = [u, v] and reduce to the square case + a = op.inputs[0] + y = a[..., :, num_rows:] + u = r[..., :, :num_rows] + dv = dr[..., :, num_rows:] + du = dr[..., :, :num_rows] + dy = math_ops.matmul(q, dv) + dx = _QrGradSquareAndDeepMatrices(q, u, + dq + math_ops.matmul(y, dv, adjoint_b=True), + du) + return array_ops.concat([dx, dy], axis=-1) @ops.RegisterGradient("MatrixSolve") diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index 1318f575737..5a9a63637f6 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -940,6 +940,9 @@ def convolution( filter: An (N+2)-D `Tensor` with the same type as `input` and shape `spatial_filter_shape + [in_channels, out_channels]`. padding: A string, either `"VALID"` or `"SAME"`. The padding algorithm. + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same + height/width dimension as the input. strides: Optional. Sequence of N ints >= 1. Specifies the output stride. Defaults to [1]*N. If any value of strides is > 1, then all values of dilation_rate must be 1. diff --git a/tensorflow/python/profiler/internal/python_hooks.cc b/tensorflow/python/profiler/internal/python_hooks.cc index 73bc3731290..33e182f8de0 100644 --- a/tensorflow/python/profiler/internal/python_hooks.cc +++ b/tensorflow/python/profiler/internal/python_hooks.cc @@ -46,7 +46,7 @@ PythonHooks* PythonHooks::GetSingleton() { } void PythonHooks::Start(const PythonHooksOptions& option) { - DCHECK(Py_IsInitialized()); + if (!Py_IsInitialized()) return; if (option.enable_python_traceme || option.enable_trace_python_function) { PyGILState_STATE gil_state = PyGILState_Ensure(); if (option.enable_trace_python_function) { @@ -200,8 +200,12 @@ void PythonHooks::ClearProfilerInAllThreads() { void PythonHooks::EnableTraceMe(bool enable) { const char* kModuleName = "tensorflow.python.profiler.trace"; - auto trace_module = py::module::import(kModuleName); - trace_module.attr("enabled") = py::bool_(enable); + try { + auto trace_module = py::module::import(kModuleName); + trace_module.attr("enabled") = py::bool_(enable); + } catch (const py::error_already_set& e) { + LOG(ERROR) << "Can't import " << kModuleName; + } } } // namespace profiler diff --git a/tensorflow/python/saved_model/BUILD b/tensorflow/python/saved_model/BUILD index 240b60f43f6..1fc6253f763 100644 --- a/tensorflow/python/saved_model/BUILD +++ b/tensorflow/python/saved_model/BUILD @@ -281,6 +281,15 @@ py_library( ], ) +py_library( + name = "save_context", + srcs = [ + "save_context.py", + ], + srcs_version = "PY2AND3", + deps = [], +) + py_library( name = "save", srcs = [ @@ -293,6 +302,7 @@ py_library( ":function_serialization", ":nested_structure_coder", ":revived_types", + ":save_context", ":save_options", ":signature_constants", ":signature_def_utils", diff --git a/tensorflow/python/saved_model/save.py b/tensorflow/python/saved_model/save.py index 
802ce1d61b7..84764431b9d 100644 --- a/tensorflow/python/saved_model/save.py +++ b/tensorflow/python/saved_model/save.py @@ -45,6 +45,7 @@ from tensorflow.python.saved_model import constants from tensorflow.python.saved_model import function_serialization from tensorflow.python.saved_model import nested_structure_coder from tensorflow.python.saved_model import revived_types +from tensorflow.python.saved_model import save_context from tensorflow.python.saved_model import save_options from tensorflow.python.saved_model import signature_constants from tensorflow.python.saved_model import signature_def_utils @@ -985,8 +986,11 @@ def export_meta_graph(obj, filename, signatures=None, options=None): ops.dismantle_graph(exported_graph) -def _build_meta_graph(obj, export_dir, signatures, options, - meta_graph_def=None): +def _build_meta_graph_impl(obj, + export_dir, + signatures, + options, + meta_graph_def=None): """Creates a MetaGraph containing the resources and functions of an object.""" if ops.inside_function(): raise AssertionError( @@ -1044,3 +1048,14 @@ def _build_meta_graph(obj, export_dir, signatures, options, graph_debug_info.SerializeToString(deterministic=True)) return meta_graph_def, exported_graph, object_saver, asset_info + + +def _build_meta_graph(obj, + export_dir, + signatures, + options, + meta_graph_def=None): + """Creates a MetaGraph under a SaveContext.""" + with save_context.save_context(): + return _build_meta_graph_impl(obj, export_dir, signatures, options, + meta_graph_def) diff --git a/tensorflow/python/saved_model/save_context.py b/tensorflow/python/saved_model/save_context.py new file mode 100644 index 00000000000..53d92587247 --- /dev/null +++ b/tensorflow/python/saved_model/save_context.py @@ -0,0 +1,56 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Context for building SavedModel.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import contextlib +import threading + + +class SaveContext(threading.local): + """A context for building a graph of SavedModel.""" + + def __init__(self): + super(SaveContext, self).__init__() + self._in_save_context = False + + def enter_save_context(self): + self._in_save_context = True + + def exit_save_context(self): + self._in_save_context = False + + def in_save_context(self): + return self._in_save_context + +_save_context = SaveContext() + + +@contextlib.contextmanager +def save_context(): + _save_context.enter_save_context() + try: + yield + finally: + _save_context.exit_save_context() + + +def in_save_context(): + """Returns whether under a save context.""" + return _save_context.in_save_context() + diff --git a/tensorflow/stream_executor/tpu/BUILD b/tensorflow/stream_executor/tpu/BUILD index 720ba6bc0c3..71c2c728a17 100644 --- a/tensorflow/stream_executor/tpu/BUILD +++ b/tensorflow/stream_executor/tpu/BUILD @@ -16,7 +16,7 @@ cc_library( "//tensorflow/c:tf_attrtype", "//tensorflow/c:tf_status", "//tensorflow/core/tpu:libtftpu_header", - "//tensorflow/core/tpu/kernels:tpu_ops_common_c_api_hdrs", + "//tensorflow/core/tpu/kernels:tpu_util_c_api_hdrs", ], alwayslink = True, ) @@ -26,8 +26,8 @@ cc_library( hdrs = ["tpu_node_context_c_api.h"], visibility = ["//visibility:public"], deps = [ - ":tpu_executor_c_api_hdrs", "//tensorflow/core/tpu:libtftpu_header", + "//tensorflow/core/tpu/kernels:tpu_util_c_api_hdrs", ], alwayslink = True, ) @@ -38,6 +38,7 @@ cc_library( deps = [ ":tpu_executor_c_api_hdrs", "//tensorflow/core/platform:status", + "//tensorflow/core/tpu/kernels:tpu_util_c_api_hdrs", ], ) @@ -62,77 +63,34 @@ cc_library( deps = ["//tensorflow/core:lib"], ) -cc_library( - name = "tpu_stream", - hdrs = ["tpu_stream.h"], - deps = [ - ":c_api_conversions", - ":status_helper", - ":tpu_executor_c_api_hdrs", - ":tpu_stream_interface", - "//tensorflow/core/tpu:tpu_api", - "//tensorflow/stream_executor:stream", - ], -) - -cc_library( - name = "tpu_timer", - hdrs = ["tpu_timer.h"], - deps = [ - ":tpu_executor_c_api_hdrs", - "//tensorflow/core/platform:types", - "//tensorflow/core/tpu:tpu_api", - "//tensorflow/stream_executor:stream", - ], -) - cc_library( name = "tpu_executor", - srcs = ["tpu_executor.cc"], - hdrs = ["tpu_executor.h"], + srcs = [ + "tpu_executor.cc", + "tpu_platform.cc", + ], + hdrs = [ + "tpu_executor.h", + "tpu_platform.h", + "tpu_stream.h", + "tpu_timer.h", + ], deps = [ ":c_api_conversions", ":status_helper", ":tpu_executor_c_api_hdrs", ":tpu_executor_interface", - ":tpu_platform", ":tpu_platform_interface", - ":tpu_stream", - ":tpu_timer", + ":tpu_stream_interface", "//tensorflow/c:tf_status", "//tensorflow/core:lib", + "//tensorflow/core/platform:types", "//tensorflow/core/tpu:tpu_api", "//tensorflow/stream_executor:stream", "//tensorflow/stream_executor/lib", "@com_google_absl//absl/container:flat_hash_map", ], -) - -cc_library( - name = "tpu_executor_hdrs", - hdrs = ["tpu_executor.h"], - deps = [ - ":tpu_executor_c_api_hdrs", - ":tpu_executor_interface", - ":tpu_platform_hdrs", - ":tpu_platform_interface", - "//tensorflow/core/platform:types", - "//tensorflow/stream_executor:stream_header", - "//tensorflow/stream_executor/lib", - "@com_google_absl//absl/container:flat_hash_map", - ], -) - -cc_library( - name = 
"tpu_platform_hdrs", - hdrs = ["tpu_platform.h"], - deps = [ - ":tpu_executor_c_api_hdrs", - ":tpu_platform_interface", - "//tensorflow/core/platform:types", - "//tensorflow/stream_executor:stream_header", - "@com_google_absl//absl/container:flat_hash_map", - ], + alwayslink = True, ) cc_library( @@ -158,29 +116,11 @@ cc_library( ], ) -cc_library( - name = "tpu_platform", - srcs = ["tpu_platform.cc"], - hdrs = ["tpu_platform.h"], - deps = [ - ":status_helper", - ":tpu_executor_c_api_hdrs", - ":tpu_executor_hdrs", - ":tpu_platform_interface", - "//tensorflow/c:tf_status", - "//tensorflow/core/platform:types", - "//tensorflow/core/tpu:tpu_api", - "//tensorflow/stream_executor:stream", - "@com_google_absl//absl/container:flat_hash_map", - ], - alwayslink = True, -) - cc_library( name = "tpu_transfer_manager", srcs = ["tpu_transfer_manager_registration.cc"], deps = [ - ":tpu_platform", + ":tpu_executor", ":tpu_transfer_manager_base", "//tensorflow/compiler/xla/service:transfer_manager", ], @@ -194,8 +134,8 @@ cc_library( ":c_api_conversions", ":proto_helper", ":status_helper", + ":tpu_executor", ":tpu_executor_c_api_hdrs", - ":tpu_platform", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:xla_data_proto_cc", @@ -211,8 +151,8 @@ cc_library( srcs = ["tpu_computation_placer.cc"], hdrs = ["tpu_computation_placer.h"], deps = [ + ":tpu_executor", ":tpu_executor_c_api_hdrs", - ":tpu_platform", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla/service:computation_placer", ], diff --git a/tensorflow/stream_executor/tpu/status_helper.h b/tensorflow/stream_executor/tpu/status_helper.h index 8fcf302edac..bc8820f5fef 100644 --- a/tensorflow/stream_executor/tpu/status_helper.h +++ b/tensorflow/stream_executor/tpu/status_helper.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_STREAM_EXECUTOR_TPU_STATUS_HELPER_H_ #include "tensorflow/core/platform/status.h" +#include "tensorflow/core/tpu/kernels/tpu_util_c_api.h" #include "tensorflow/stream_executor/tpu/tpu_executor_c_api.h" struct StatusHelper { diff --git a/tensorflow/stream_executor/tpu/tpu_executor_c_api.h b/tensorflow/stream_executor/tpu/tpu_executor_c_api.h index eee69a35b23..5911d651b66 100644 --- a/tensorflow/stream_executor/tpu/tpu_executor_c_api.h +++ b/tensorflow/stream_executor/tpu/tpu_executor_c_api.h @@ -21,7 +21,7 @@ limitations under the License. #include "tensorflow/c/tf_attrtype.h" #include "tensorflow/c/tf_status.h" -#include "tensorflow/core/tpu/kernels/tpu_ops_common_c_api.h" +#include "tensorflow/core/tpu/kernels/tpu_util_c_api.h" #include "tensorflow/core/tpu/libtftpu.h" typedef struct SE_Platform SE_Platform; diff --git a/tensorflow/stream_executor/tpu/tpu_node_context_c_api.h b/tensorflow/stream_executor/tpu/tpu_node_context_c_api.h index d47fdf37a46..e7ca506df72 100644 --- a/tensorflow/stream_executor/tpu/tpu_node_context_c_api.h +++ b/tensorflow/stream_executor/tpu/tpu_node_context_c_api.h @@ -15,8 +15,8 @@ limitations under the License. 
#ifndef TENSORFLOW_STREAM_EXECUTOR_TPU_TPU_NODE_CONTEXT_C_API_H_ #define TENSORFLOW_STREAM_EXECUTOR_TPU_TPU_NODE_CONTEXT_C_API_H_ +#include "tensorflow/core/tpu/kernels/tpu_util_c_api.h" #include "tensorflow/core/tpu/libtftpu.h" -#include "tensorflow/stream_executor/tpu/tpu_executor_c_api.h" typedef struct XLA_TpuNodeContext XLA_TpuNodeContext; @@ -36,6 +36,7 @@ void TpuNodeContext_CloseTpuHost(SE_Status* status); struct TfTpu_NodeContextApiFn { TFTPU_ADD_FN_IN_STRUCT(TpuNodeContext_Create); TFTPU_ADD_FN_IN_STRUCT(TpuNodeContext_Free); + TFTPU_ADD_FN_IN_STRUCT(TpuNodeContext_Initialize); TFTPU_ADD_FN_IN_STRUCT(TpuNodeContext_StopChipHeartbeats); TFTPU_ADD_FN_IN_STRUCT(TpuNodeContext_CloseTpuHost); }; diff --git a/tensorflow/stream_executor/tpu/tpu_platform.cc b/tensorflow/stream_executor/tpu/tpu_platform.cc index 97a97a63351..db6324ecaec 100644 --- a/tensorflow/stream_executor/tpu/tpu_platform.cc +++ b/tensorflow/stream_executor/tpu/tpu_platform.cc @@ -100,7 +100,7 @@ TpuPlatform::GetUncachedExecutor( return status.status(); } return std::make_unique( - this, absl::make_unique(this, executor), + this, std::make_unique(this, executor), config.ordinal); } diff --git a/tensorflow/tools/ci_build/release/windows/cpu_py35_full/release.bat b/tensorflow/tools/ci_build/release/windows/cpu_py35_full/release.bat index bd8c217ddef..02b12c7650a 100644 --- a/tensorflow/tools/ci_build/release/windows/cpu_py35_full/release.bat +++ b/tensorflow/tools/ci_build/release/windows/cpu_py35_full/release.bat @@ -17,4 +17,4 @@ SET PYTHON_DIRECTORY=Python35 CALL tensorflow\tools\ci_build\release\common_win.bat -call tensorflow\tools\ci_build\windows\cpu\pip\run.bat --release_build --extra_build_flags "--config=v2" --extra_test_flags "--test_env=TF2_BEHAVIOR=1" --project_name "tensorflow_cpu" +call tensorflow\tools\ci_build\windows\cpu\pip\run.bat --release_build --extra_build_flags "--config=v2 --define=no_tensorflow_py_deps=true" --extra_test_flags "--test_env=TF2_BEHAVIOR=1" --project_name "tensorflow_cpu" diff --git a/tensorflow/tools/ci_build/release/windows/cpu_py36_full/release.bat b/tensorflow/tools/ci_build/release/windows/cpu_py36_full/release.bat index 0a81a90a431..e44e6ca6e18 100644 --- a/tensorflow/tools/ci_build/release/windows/cpu_py36_full/release.bat +++ b/tensorflow/tools/ci_build/release/windows/cpu_py36_full/release.bat @@ -17,4 +17,4 @@ SET PYTHON_DIRECTORY=Python36 CALL tensorflow\tools\ci_build\release\common_win.bat -call tensorflow\tools\ci_build\windows\cpu\pip\run.bat --release_build --extra_build_flags "--config=v2" --extra_test_flags "--test_env=TF2_BEHAVIOR=1" --project_name "tensorflow_cpu" +call tensorflow\tools\ci_build\windows\cpu\pip\run.bat --release_build --extra_build_flags "--config=v2 --define=no_tensorflow_py_deps=true" --extra_test_flags "--test_env=TF2_BEHAVIOR=1" --project_name "tensorflow_cpu" diff --git a/tensorflow/tools/ci_build/release/windows/cpu_py37_full/release.bat b/tensorflow/tools/ci_build/release/windows/cpu_py37_full/release.bat index 9591d7aac34..c65167a5dc6 100644 --- a/tensorflow/tools/ci_build/release/windows/cpu_py37_full/release.bat +++ b/tensorflow/tools/ci_build/release/windows/cpu_py37_full/release.bat @@ -17,4 +17,4 @@ SET PYTHON_DIRECTORY=Python37 CALL tensorflow\tools\ci_build\release\common_win.bat -call tensorflow\tools\ci_build\windows\cpu\pip\run.bat --release_build --extra_build_flags "--config=v2" --extra_test_flags "--test_env=TF2_BEHAVIOR=1" --project_name "tensorflow_cpu" +call tensorflow\tools\ci_build\windows\cpu\pip\run.bat 
--release_build --extra_build_flags "--config=v2 --define=no_tensorflow_py_deps=true" --extra_test_flags "--test_env=TF2_BEHAVIOR=1" --project_name "tensorflow_cpu" diff --git a/tensorflow/tools/ci_build/release/windows/cpu_py38_full/release.bat b/tensorflow/tools/ci_build/release/windows/cpu_py38_full/release.bat index 7a7435b3713..06599fc0d8c 100644 --- a/tensorflow/tools/ci_build/release/windows/cpu_py38_full/release.bat +++ b/tensorflow/tools/ci_build/release/windows/cpu_py38_full/release.bat @@ -17,4 +17,5 @@ SET PYTHON_DIRECTORY=Python38 CALL tensorflow\tools\ci_build\release\common_win.bat -call tensorflow\tools\ci_build\windows\cpu\pip\run.bat --release_build --extra_build_flags "--config=v2" --extra_test_flags "--test_env=TF2_BEHAVIOR=1" --project_name "tensorflow_cpu" +call tensorflow\tools\ci_build\windows\cpu\pip\run.bat --release_build --extra_build_flags "--config=v2 --define=no_tensorflow_py_deps=true" --extra_test_flags "--test_env=TF2_BEHAVIOR=1" --project_name "tensorflow_cpu" + diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index d34e7d973d3..49cd146bed5 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -409,12 +409,12 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "org_sqlite", build_file = clean_dep("//third_party:sqlite.BUILD"), - sha256 = "f3c79bc9f4162d0b06fa9fe09ee6ccd23bb99ce310b792c5145f87fbcc30efca", - strip_prefix = "sqlite-amalgamation-3310100", + sha256 = "e9cec01d4519e2d49b3810615237325263fe1feaceae390ee12b4a29bd73dbe2", + strip_prefix = "sqlite-amalgamation-3320300", system_build_file = clean_dep("//third_party/systemlibs:sqlite.BUILD"), urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/www.sqlite.org/2020/sqlite-amalgamation-3310100.zip", - "https://www.sqlite.org/2020/sqlite-amalgamation-3310100.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/www.sqlite.org/2020/sqlite-amalgamation-3320300.zip", + "https://www.sqlite.org/2020/sqlite-amalgamation-3320300.zip", ], ) @@ -710,8 +710,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. 
- LLVM_COMMIT = "f1c671925b1c60ded3e4e7b3c6b1ec984b2d9b93" - LLVM_SHA256 = "57fc8f0ab46bdfdff52b03c2196d658c094bc4179cd1cf9495becf6a8466123a" + LLVM_COMMIT = "6507bc56216ba4441790bc581a5b76d9c2ad9774" + LLVM_SHA256 = "d1749ab8a32110fae83881ca6c82383632516c6fd5ffbd2c5dd1b486db224e46" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), diff --git a/third_party/llvm/llvm.autogenerated.BUILD b/third_party/llvm/llvm.autogenerated.BUILD index c70ff559165..50ff746b9f2 100644 --- a/third_party/llvm/llvm.autogenerated.BUILD +++ b/third_party/llvm/llvm.autogenerated.BUILD @@ -561,6 +561,7 @@ filegroup( name = "common_target_td_sources", srcs = glob([ "include/llvm/CodeGen/*.td", + "include/llvm/Frontend/Directive/*.td", "include/llvm/IR/Intrinsics*.td", "include/llvm/TableGen/*.td", "include/llvm/Target/*.td", @@ -666,6 +667,17 @@ cc_library( ], ) +gentbl( + name = "omp_gen", + tbl_outs = [("--gen-directive-decls", "include/llvm/Frontend/OpenMP/OMP.h.inc")], + tblgen = ":llvm-tblgen", + td_file = "include/llvm/Frontend/OpenMP/OMP.td", + td_srcs = glob([ + "include/llvm/Frontend/OpenMP/*.td", + "include/llvm/Frontend/Directive/*.td", + ]), +) + ########################## Begin generated content ########################## cc_library( name = "AArch64AsmParser", @@ -2053,6 +2065,7 @@ cc_library( ":Support", ":TransformUtils", ":config", + ":omp_gen", ], ) diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index 8fd0a94bf64..ba9b580e53f 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -1507,6 +1507,7 @@ cc_library( ":StandardToLLVM", ":Support", ":Transforms", + "@llvm-project//llvm:Support", ], )
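The `"valid"`/`"same"` padding wording added to the convolutional layer docstrings above can be checked numerically. A minimal sketch, not part of the patch and assuming a TensorFlow 2.x install, showing the output-shape difference for a stride-1 convolution:

import tensorflow as tf

x = tf.random.normal([1, 28, 28, 3])  # NHWC input

valid = tf.keras.layers.Conv2D(filters=8, kernel_size=3, padding="valid")(x)
same = tf.keras.layers.Conv2D(filters=8, kernel_size=3, padding="same")(x)

print(valid.shape)  # (1, 26, 26, 8): no padding, each spatial dim shrinks by kernel_size - 1
print(same.shape)   # (1, 28, 28, 8): input padded evenly so height/width match the input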
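The same padding text is added to the `conv_utils` helpers; their behavior can be sanity-checked through `conv_output_shape`, whose signature appears in the hunk above. A small sketch, assuming the module is importable from the source tree:

from tensorflow.python.keras.utils import conv_utils

# 2-D spatial shape (10, 10), 3x3 kernel, stride 1 in both dimensions.
print(conv_utils.conv_output_shape((10, 10), (3, 3), (1, 1), "same"))   # (10, 10): output matches input
print(conv_utils.conv_output_shape((10, 10), (3, 3), (1, 1), "valid"))  # (8, 8): shrinks by kernel - 1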
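The rewritten `_QrGrad` partitions a wide input a = [x, y] (and r = [u, v]) and reduces to the square/deep case, so gradients now flow through `tf.linalg.qr` when rows < cols with `full_matrices=False`. A quick sketch, assuming TensorFlow 2.x with this change applied:

import tensorflow as tf

a = tf.Variable(tf.random.normal([3, 5], dtype=tf.float64))  # wide: 3 rows, 5 cols

with tf.GradientTape() as tape:
  q, r = tf.linalg.qr(a, full_matrices=False)
  loss = tf.reduce_sum(q) + tf.reduce_sum(r)

grad = tape.gradient(loss, a)  # raised NotImplementedError before this change
print(grad.shape)              # (3, 5)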
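The new `save_context` module exposes a thread-local flag that `_build_meta_graph` sets for the duration of tracing, so library code can branch on it to pick a serialization-friendly path. A usage sketch (the helper function is hypothetical; the module and its API come from this change):

from tensorflow.python.saved_model import save_context

def pick_branch():
  # Hypothetical helper: reports which code path would run.
  if save_context.in_save_context():
    return "saving"   # executed while tf.saved_model.save is building the MetaGraph
  return "normal"

assert pick_branch() == "normal"
with save_context.save_context():
  assert pick_branch() == "saving"
assert pick_branch() == "normal"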